- Small changes to MARC record support - Implemented loadDocument API, for loading and parsing the DOMs of HTML documents in the background - Added scraper code to SVN repository (now includes 12 scrapers, see Writeboard for details) - www - Unnamed repository; edit this file 'description' to name the repository.

commit 152c9bf9e7ee421d8e56db29adef7680b392c096
parent 6c55e63eab9ea6b538105b76180aae3de6bc4270
Author: Simon Kornblith <simon@simonster.com>
Date:   Tue,  6 Jun 2006 18:25:45 +0000

- Small changes to MARC record support
- Implemented loadDocument API, for loading and parsing the DOMs of HTML documents in the background
- Added scraper code to SVN repository (now includes 12 scrapers, see Writeboard for details)

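To update to the latest versions of all scrapers, ensure you have an up-to-date version of sqlite3, then run the following command, replacing profileName with the name of your own Firefox profile directory (the path shown is the Mac OS X profile location):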
sqlite3 ~/Library/Application\ Support/Firefox/Profiles/profileName/scholar.sqlite < scrapers.sql



Diffstat:
M chrome/chromeFiles/content/scholar/ingester/browser.js  | 3 ++-
M chrome/chromeFiles/content/scholar/ingester/browser.xul  | 3 +++
M chrome/chromeFiles/content/scholar/xpcom/ingester.js  | 217 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M chrome/chromeFiles/content/scholar/xpcom/marc.js  | 53 +++++++++++++++++++++++++++++++++++++++++++++--------
A scrapers.sql  | 1015 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

5 files changed, 1206 insertions(+), 85 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -35,6 +35,7 @@ Scholar.Ingester.Interface.init = function() {
  */
 Scholar.Ingester.Interface.chromeLoad = function() {
 	Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
+	Scholar.Ingester.Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
 	Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
 	Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
 	
@@ -189,7 +190,7 @@ Scholar.Ingester.Interface._setDocument = function(browser) {
 			browser.setAttribute("scholar-key", key);
 		}
 	}
-	Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
+	Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar.Ingester.Interface.hiddenBrowser);
 	Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
 }
 
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.xul b/chrome/chromeFiles/content/scholar/ingester/browser.xul
@@ -19,4 +19,7 @@
             <image id="scholar-status-image" width="16" height="16" onclick="Scholar.Ingester.Interface.scrapeThisPage()" />
          </statusbarpanel>
     </statusbar>
+	<box style="visibility: collapse">
+		<browser id="scholar-hidden-browser" />
+	</box>
 </overlay>
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -48,7 +48,9 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
 /////////////////////////////////////////////////////////////////
 // Scholar.Ingester.Utilities class, a set of methods to assist in data
 // extraction. Most code here was stolen directly from the Piggy Bank project.
-Scholar.Ingester.Utilities = function() {}
+Scholar.Ingester.Utilities = function(hiddenBrowser) {
+	this.hiddenBrowser = hiddenBrowser;
+}
 
 // Adapter for Piggy Bank function to print debug messages; log level is
 // fixed at 4 (could change this)
@@ -99,6 +101,7 @@ Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, paren
 // Loads a single document for a scraper, running succeeded() on success or
 // failed() on failure
 Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
+	Scholar.debug("loadDocument called");
 	this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
 }
 
@@ -112,6 +115,9 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
 // exception - a function to execute if an exception occurs (exceptions are
 //             also logged in the Firefox Scholar log)
 Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+	var hiddenBrowser = this.hiddenBrowser;
+	Scholar.debug("processDocuments called");
+	
 	try {
 		if (urls.length == 0) {
 			if (firstDoc) {
@@ -128,53 +134,51 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
 			if (urlIndex < urls.length) {
 				try {
 					var url = urls[urlIndex];
-					var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
-					b.loadURI(url);
+					Scholar.debug("loading "+url);
+					hiddenBrowser.loadURI(url);
 				} catch (e) {
-					exception(e);
 					Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
+					exception(e);
 				}
 			} else {
-				window.setTimeout(done, 10);
+				hiddenBrowser.setTimeout(done, 10);
 			}
 		};
 		var onLoad = function() {
-			try {
-				var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
-				processor(b.contentDocument, doLoad);
-			} catch (e) {
-				exception(e);
-				Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+			Scholar.debug("onLoad called");
+			if(hiddenBrowser.id == "scholar-hidden-browser") {
+				hiddenBrowser.removeEventListener("DOMContentLoaded", onLoad, true);
+				try {
+					var newHiddenBrowser = new Object();
+					Scholar.debug("new hidden browser");
+					newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
+					newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
+					Scholar.debug("added attributes");
+					processor(newHiddenBrowser);
+					Scholar.debug("called processor");
+				} catch (e) {
+					Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+					exception(e);
+				}
 			}
 		};
 		var init = function() {
-			var listener;
-			listener.onStateChange = function(webProgress, request, stateFlags, status) {
-				if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
-					request.name == urls[urlIndex]) {
-					try {
-						Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
-					} catch (e) {
-						exception(e);
-						Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
-					}
-				}
-			};
-			
-			var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
-			tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
+			Scholar.debug("init called");
+			hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true);
 			
 			if (firstDoc) {
+				Scholar.debug("processing");
 				processor(firstDoc, doLoad);
 			} else {
+				Scholar.debug("doing load");
 				doLoad();
 			}
 		}
 		
-		w.addEventListener("load", init, false);
+		init();
 	} catch (e) {
+		Scholar.debug("processDocuments: " + e);
 		exception(e);
-		PB_Debug.print("processDocuments: " + e);
 	}
 }
 
@@ -209,12 +213,18 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
 // break compatibility
 Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
 	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
-	return author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+	return author.replace(/  +/, ' ');
 }
 
 Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
 	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
-	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
+	author = author.replace(/  +/, ' ');
+	// Add period for initials
+	if(author.substring(author.length-2, author.length-1) == " ") {
+		author += ".";
+	}
 	var splitNames = author.split(', ');
 	if(splitNames.length > 1) {
 		author = splitNames[1]+' '+splitNames[0];
@@ -222,6 +232,16 @@ Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
 	return author;
 }
 
+Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
+	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+	var regexp = /^[^ ]*/;
+	var m = regexp.exec(author);
+	if(m) {
+		return m[0];
+	}
+}
+
 Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
 	if(!part) {
 		part = 'a';
@@ -253,27 +273,29 @@ Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri,
 
 // This is an extension to PiggyBank's architecture. It's here so that we don't
 // need an enormous library for each scraper that wants to use MARC records
-Scholar.Ingester.Utilities.prototype.importMARCRecord = function(text, format, uri, model) {
+Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) {
 	var prefixDC = 'http://purl.org/dc/elements/1.1/';
 	var prefixDCMI = 'http://purl.org/dc/dcmitype/';
 	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
 	
-	var record = new Scholar.Ingester.MARC_Record();
-	record.load(text, format);
-	
 	// Extract ISBNs
-	model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
+	model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
 	// Extract ISSNs
-	model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
+	model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
 	// Extract creators
 	model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor);
 	model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
 	model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
 	model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
-	if(!model.data[uri][prefixDC + 'creator']) {
+	model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor);
+	model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
+	model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
+	model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
+	if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) {	// some LOC entries have no listed author, but have the author
+													// in the person subject field as the first entry
 		var field = record.get_field_subfields('600');
-		if(field) {
-			model = this.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));	
+		if(field[0]) {
+			model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));	
 		}
 	}
 	// Extract title
@@ -403,12 +425,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
 /*
  * Constructor for Document object
  */
-Scholar.Ingester.Document = function(browserWindow){
+Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
 	this.browser = browserWindow;
+	this.model = new Scholar.Ingester.Model();
 	this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
 	             .getService(Ci.nsIAppShellService);
-	this.scraper = null
-	this.model = new Scholar.Ingester.Model();
+	this.scraper = null;
+	this.hiddenBrowser = hiddenBrowser;
 	this._generateSandbox();
 }
 
@@ -530,11 +553,13 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
 	this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
 	this.sandbox.browser = this.browser;
 	this.sandbox.doc = this.sandbox.browser.contentDocument;
-	this.sandbox.utilities = new Scholar.Ingester.Utilities;
+	this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser);
 	this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
 	this.sandbox.window = this.window;
 	this.sandbox.model = this.model;
 	this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
+	this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
+	this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
 	
 	var me = this;
 	this.sandbox.wait = function(){ me._waitForCompletion = true; };
@@ -552,50 +577,90 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
 	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
 	
 	for(var uri in this.model.data) {
-		var newItem = Scholar.Items.getNewItemByType(1);
+		if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
+			var newItem = Scholar.Items.getNewItemByType(2);
+		} else {
+			var newItem = Scholar.Items.getNewItemByType(1);
+		}
 		newItem.setField("source", uri);
 		if(this.model.data[uri][prefixDC + 'title']) {
 			newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
 		}
-		if(this.model.data[uri][prefixDC + 'publisher']) {
-			newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
-		}
-		if(this.model.data[uri][prefixDC + 'year']) {
-			if(this.model.data[uri][prefixDC + 'year'].length == 4) {
-				newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
-			} else {
-				try {
-					newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
-							 this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
-							 this.model.data[uri][prefixDC + 'year'][0].length));
-				} catch(e) {}
-			}
-		}
-		if(this.model.data[uri][prefixDC + 'edition']) {
-			newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
-		}
-		if(this.model.data[uri][prefixDummy + 'series']) {
-			newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
-		}
-		if(this.model.data[uri][prefixDummy + 'place']) {
-			newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
-		}
-		if(this.model.data[uri][prefixDC + 'identifier']) {
-			for(i in this.model.data[uri][prefixDC + 'identifier']) {
-				if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
-					newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
-					break;
-				}
-			}
-		}
+		var creatorIndex = 0;
 		if(this.model.data[uri][prefixDC + 'creator']) {
 			for(i in this.model.data[uri][prefixDC + 'creator']) {
 				var creator = this.model.data[uri][prefixDC + 'creator'][i];
 				var spaceIndex = creator.lastIndexOf(" ");
 				var lastName = creator.substring(spaceIndex+1, creator.length);
 				var firstName = creator.substring(0, spaceIndex);
+				
+				newItem.setCreator(creatorIndex, firstName, lastName, 1);
+				creatorIndex++;
+			}
+		}
+		if(this.model.data[uri][prefixDC + 'contributor']) {
+			for(i in this.model.data[uri][prefixDC + 'contributor']) {
+				var creator = this.model.data[uri][prefixDC + 'contributor'][i];
+				var spaceIndex = creator.lastIndexOf(" ");
+				var lastName = creator.substring(spaceIndex+1, creator.length);
+				var firstName = creator.substring(0, spaceIndex);
 			
-				newItem.setCreator(i, firstName, lastName);
+				newItem.setCreator(creatorIndex, firstName, lastName, 2);
+				creatorIndex++;
+			}
+		}
+		if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
+			if(this.model.data[uri][prefixDummy + 'publication']) {
+				newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]);
+			}
+			if(this.model.data[uri][prefixDummy + 'volume']) {
+				newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]);
+			}
+			if(this.model.data[uri][prefixDummy + 'number']) {
+				newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]);
+			}
+			if(this.model.data[uri][prefixDummy + 'pages']) {
+				newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]);
+			}
+			if(this.model.data[uri][prefixDC + 'identifier']) {
+				for(i in this.model.data[uri][prefixDC + 'identifier']) {
+					if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') {
+						newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
+						break;
+					}
+				}
+			}
+		} else {
+			if(this.model.data[uri][prefixDC + 'publisher']) {
+				newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
+			}
+			if(this.model.data[uri][prefixDC + 'year']) {
+				if(this.model.data[uri][prefixDC + 'year'].length == 4) {
+					newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
+				} else {
+					try {
+						newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
+								 this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
+								 this.model.data[uri][prefixDC + 'year'][0].length));
+					} catch(e) {}
+				}
+			}
+			if(this.model.data[uri][prefixDC + 'edition']) {
+				newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
+			}
+			if(this.model.data[uri][prefixDummy + 'series']) {
+				newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
+			}
+			if(this.model.data[uri][prefixDummy + 'place']) {
+				newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
+			}
+			if(this.model.data[uri][prefixDC + 'identifier']) {
+				for(i in this.model.data[uri][prefixDC + 'identifier']) {
+					if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
+						newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
+						break;
+					}
+				}
 			}
 		}
 		newItem.save();
diff --git a/chrome/chromeFiles/content/scholar/xpcom/marc.js b/chrome/chromeFiles/content/scholar/xpcom/marc.js
@@ -80,8 +80,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s 
 			}
 			this.add_field(tag,ind1,ind2,value);
 		}
-	}
-	if (f == 'MARC_Harvard') {
+	} else if (f == 'MARC_Harvard') {
 		var linee = s.split('\n');
 		for (var i=0; i<linee.length; i++) {
 			linee[i] = this._trim(linee[i]);
@@ -128,8 +127,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s 
 			}
 		}
 		this.add_field_005();
-	}
-	if (f == 'MARC_BNI') {
+	} else if (f == 'MARC_BNI') {
 		var linee = s.split('\n');
 		for (var i=0; i<linee.length; i++) {
 			linee[i] = this._trim(linee[i]);
@@ -167,8 +165,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s 
 			}
 		}
 		this.add_field_005();
-	}
-	if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov 
+	} else if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov 
 		var linee = s.split('\n');
 		for (var i=0; i<linee.length; i++) {
 			linee[i] = this._trim(linee[i]);
@@ -209,6 +206,46 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s 
 			}
 		}
 		this.add_field_005();
+	} else if (f == 'MARC_PAC') {
+		var linee = s.split('\n');
+		for (var i=0; i<linee.length; i++) {
+			linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
+			linee[i] = linee[i].replace(/_/g,' ');
+			linee[i] = linee[i].replace(/\t/g,'');
+			linee[i] = this._trim(linee[i]);
+			if (linee[i] == '') continue; // jumps empty lines
+			var replacer = this.subfield_delimiter+'$1';
+			linee[i]  = linee[i].replace(/\|(.)/g,replacer);
+			linee[i]  = linee[i].replace(/\|/g,this.subfield_delimiter);
+			var tag   = linee[i].substr(0,3);
+			var ind1  = linee[i].substr(4,1);
+			var ind2  = linee[i].substr(5,1);
+			var value = this.subfield_delimiter+'a'+linee[i].substr(7);
+			if(linee[i].substr(0, 6) == "LEADER") {
+				value = linee[i].substr(7);
+				this.leader.record_length = '00000';
+				this.leader.record_status = value.substr(5,1);
+				this.leader.type_of_record = value.substr(6,1);
+				this.leader.bibliographic_level = value.substr(7,1);
+				this.leader.type_of_control = value.substr(8,1);
+				this.leader.character_coding_scheme = value.substr(9,1);
+				this.leader.indicator_count = '2';
+				this.leader.subfield_code_length = '2';
+				this.leader.base_address_of_data = '00000';
+				this.leader.encoding_level = value.substr(17,1);
+				this.leader.descriptive_cataloging_form = value.substr(18,1);
+				this.leader.linked_record_requirement = value.substr(19,1);
+				this.leader.entry_map = '4500';
+				
+				this.directory = '';
+				this.directory_terminator = this.field_terminator;
+				this.variable_fields = new Array();
+			}
+			else if (tag > '008' && tag < '899') { // jumps low and high tags
+				if (tag != '040') this.add_field(tag,ind1,ind2,value);
+			}
+		}
+		this.add_field_005();
 	}
 	
 	this.update_record_length();
@@ -310,7 +347,7 @@ Scholar.Ingester.MARC_Record.prototype.exists = function(tag) { // field existen
 	return false;
 }
 
-function MARC_field(rec,tag,ind1,ind2,value) { // new MARC gield
+Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC gield
 	this.tag = tag;
 	this.occ = rec.count_occ(tag)+1; // occurrence order no.
 	this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' ';
@@ -428,7 +465,7 @@ Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) { // 
 
 Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
 	if (tag.length != 3) { return false; }
-	var F = new MARC_field(this,tag,ind1,ind2,value);
+	var F = new this.MARC_field(this,tag,ind1,ind2,value);
 	// adds pointer to list of fields
 	this.variable_fields[this.variable_fields.length] = F;
 	// adds the entry to the directory
diff --git a/scrapers.sql b/scrapers.sql
@@ -0,0 +1,1014 @@
+BEGIN TRANSACTION;
+DELETE FROM scrapers;
+INSERT INTO "scrapers" VALUES(1, NULL, NULL, 20060603002000, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+	return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+var cleanString = function(s) {
+	s = utilities.trimString(s);
+	return s.replace(/ +/g, " ");
+}
+
+var uri = doc.location.href;
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+
+// Retrieve authors
+var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	var elmt = elmts[i];
+	
+	model.addStatement(uri, prefixDC + ''creator'', cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
+}
+
+// Retrieve data from "Product Details" box
+var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	var elmt = elmts[i];
+	var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
+	if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
+		var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
+		
+		if(attribute == "Publisher:") {
+			if(value.lastIndexOf("(") != -1) {
+				var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
+				value = value.substring(0, value.lastIndexOf("(")-1);
+			}
+			if(value.lastIndexOf(";") != -1) {
+				var edition = value.substring(value.lastIndexOf(";")+2, value.length);
+				value = value.substring(0, value.lastIndexOf(";"));
+			}
+			model.addStatement(uri, prefixDC + ''publisher'', value);
+			model.addStatement(uri, prefixDC + ''date'', date);
+			model.addStatement(uri, prefixDC + ''hasVersion'', edition);
+		} else if(attribute == "Language:") {
+			model.addStatement(uri, prefixDC + ''language'', value);
+		} else if(attribute == "ISBN:") {
+			model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
+		} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
+			model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
+			model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
+		}
+	}
+}
+
+var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+var title = cleanString(getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
+if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
+	title = title.substring(0, title.lastIndexOf("(")-1);
+}
+model.addStatement(uri, prefixDC + ''title'', title);');
+
+INSERT INTO "scrapers" VALUES(2, NULL, NULL, 20060603002000, 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
+'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
+	return true;
+}
+return false;',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/;
+var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/;
+var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/;
+var lineRegexp = /^([\w() ]+): *(.*)$/;
+var publisherRegexp = /^(.*), (.*?),?$/;
+
+var uri = doc.location.href;
+
+var sMatch = sessionRegexp.exec(uri);
+var sessionid = sMatch[1];
+
+var nMatch = numberRegexp.exec(uri);
+if(nMatch) {
+	var number = nMatch[1];
+} else {
+	number = 1;
+}
+
+var rMatch = resultsetRegexp.exec(uri);
+if(rMatch) {
+	var resultset = rMatch[1];
+} else {
+	// It''s in an XPCNativeWrapper, so we have to do this black magic
+	resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value;
+}
+
+var newUri = ''http://newfirstsearch.oclc.org/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0'';
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+
+function cleanAuthor(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+	// Add period for initials
+	if(author.substring(author.length-2, author.length-1) == " ") {
+		author += ".";
+	}
+	var splitNames = author.split('', '');
+	if(splitNames.length > 1) {
+		author = splitNames[1]+'' ''+splitNames[0];
+	}
+	return author;
+}
+
+utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintext'', null, function(text) {
+	var lines = text.split(''\n'');
+	for(var i=0;i<lines.length;i++) {
+		match = lineRegexp.exec(lines[i]);
+		if(match) {
+						if(match[1] == ''Title'') {
+								var title = match[2];
+								if(!lineRegexp.test(lines[i+1])) {
+										i++;
+										title += '' ''+lines[i];
+								}
+								if(title.substring(title.length-2) == " /") {
+										title = title.substring(0, title.length-2);
+								}
+								model.addStatement(uri, prefixDC + ''title'', title);
+						} else if(match[1] == ''Author(s)'') {
+								var authors = match[2].split('';'');
+								if(authors) {
+										model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(authors[0]));
+										for(var j=1; j<authors.length; j+=2) {
+												if(authors[j-1].substring(0, 1) == ''('') {
+														j++;
+												}
+												model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(authors[j]));
+										}
+								} else {
+										model.addStatement(uri, prefixDC + ''creator'', utilities.trimString(match[2]));
+								}
+						} else if(match[1] == ''Publication'') {
+								// Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
+								match[2] = utilities.trimString(match[2]);
+								if(match[2].substring(match[2].length-1) == '','') {
+										match[2] = match[2].substring(0, match[2].length-1);
+								}
+								model.addStatement(uri, prefixDC + ''publisher'', match[2]);
+						} else if(match[1] == ''Language'') {
+								model.addStatement(uri, prefixDC + ''language'', utilities.trimString(match[2]));
+						} else if(match[1] == ''Standard No'') {
+								var identifiers = match[2].split(/ +/);
+								var j=0;
+								while(j<(identifiers.length-1)) {
+										var type = identifiers[j].substring(0, identifiers[j].length-1);
+										var lastChar;
+										var value;
+
+										j++;
+										while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') {
+												if(identifiers[j].substring(0, 1) != ''('') {
+														if(lastChar == '';'') {
+																value = identifiers[j].substring(0, identifiers[j].length-1);
+														} else {
+																value = identifiers[j];
+														}
+														model.addStatement(uri, prefixDC + ''identifier'', type + '' '' + value);
+												}
+												j++;
+										}
+								}
+						} else if(match[1] == ''Year'') {
+								model.addStatement(uri, prefixDC + ''year'', match[2]);
+						}
+				}
+	}
+	
+	done();
+})
+wait();');
+
+-- Scraper 3: LOC/Voyager WebVoyage (URL pattern Pwebrecon\.cgi).
+-- The first script string returns true/false for a page; the second script
+-- requests the record in a MARC export format and imports it into the model.
+INSERT INTO "scrapers" VALUES(3, NULL, NULL, 20060603002000, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
+'// Accept the page only if form "frm" has no RC element and its RD select
+// offers one of the MARC export formats handled by the scraping script.
+try {
+	var form = doc.forms.namedItem(''frm'');
+	if(form.elements.namedItem(''RC'')) {
+		return false;
+	}
+	var export_options = form.elements.namedItem(''RD'').options;
+	for(var i = 0; i < export_options.length; i++) {
+		var optionText = export_options[i].text;
+		if(optionText == ''Latin1 MARC''
+		|| optionText == ''Raw MARC''
+		|| optionText == ''UTF-8''
+		|| optionText == ''MARC (Unicode/UTF-8)''
+		|| optionText == ''MARC (non-Unicode/MARC-8)'') {
+			return true;
+		}
+	}
+	return false;
+} catch(e) {
+	// A missing form or select means there is nothing to export on this page
+	return false;
+}',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+// Indexes of the export formats found in the RD select (undefined if absent)
+var raw, unicode, latin1;
+
+// Rebuild the export request from the hidden fields of the record form
+var form = doc.forms.namedItem(''frm'');
+var newUri = form.action;
+var postString = '''';
+for(var i = 0; i < form.elements.length; i++) {
+	var element = form.elements[i];
+	if(element.type == ''HIDDEN'' || element.type == ''hidden'') {
+		postString += escape(element.name)+''=''+escape(element.value)+''&'';
+	}
+}
+
+var export_options = form.elements.namedItem(''RD'').options;
+for(var j = 0; j < export_options.length; j++) {
+	var optionText = export_options[j].text;
+	if(optionText == ''Raw MARC''
+	|| optionText == ''MARC (non-Unicode/MARC-8)'') {
+		raw = j;
+	} else if(optionText == ''Latin1 MARC'') {
+		latin1 = j;
+	} else if(optionText == ''UTF-8''
+	|| optionText == ''MARC (Unicode/UTF-8)'') {
+		unicode = j;
+	}
+}
+
+// Request a format that was actually found instead of the leftover loop
+// index. The detection script only accepts pages offering at least one.
+// NOTE(review): preference order Unicode, Latin1, raw is an assumption --
+// confirm against what record.load(text, "binary") handles best.
+var rd = raw;
+if(unicode !== undefined) {
+	rd = unicode;
+} else if(latin1 !== undefined) {
+	rd = latin1;
+}
+postString += ''RD=''+rd+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT'';
+
+model.addStatement(uri, prefixRDF + ''type'', prefixDummy + ''book'', false);
+
+// No idea why this doesn''t work as post
+utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
+	var record = new MARC_Record();
+	record.load(text, "binary");
+	model = utilities.importMARCRecord(record, uri, model);
+	done();
+})
+wait();');
+
+-- Scraper 4: JSTOR (view and browse pages). There is no detection script
+-- (NULL); the scraping script exports the citation as a text file through
+-- the saved-citations feature and parses the tagged lines.
+INSERT INTO "scrapers" VALUES(4, NULL, NULL, 20060603002000, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Returns the first node matching xpath relative to contextNode, or null
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+	return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+// Trims the string and collapses runs of spaces into a single space
+var cleanString = function(s) {
+	s = utilities.trimString(s);
+	return s.replace(/ +/g, " ");
+}
+
+var uri = doc.location.href;
+
+
+// If this is a view page, find the link to the citation
+var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+if(!elmts || !elmts.length) {
+	// Nothing matched: try the alternate nesting of the same links
+	xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
+	elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+}
+if(!elmts || elmts.length < 2) {
+	// Both the save link and the view-saved link are needed below
+	throw new Error("JSTOR scraper: citation links not found");
+}
+var saveCitation = getNode(doc, elmts[0], ''.'', nsResolver).href;
+var viewSavedCitations = getNode(doc, elmts[1], ''.'', nsResolver).href;
+saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');
+
+// Parse save citation link
+var importantCitationRegexp = /userID.*$/;
+var match = importantCitationRegexp.exec(viewSavedCitations);
+if(!match) {
+	throw new Error("JSTOR scraper: no userID in saved citations link");
+}
+var postData = match[0]+''&citationAction=removeAll&confirmRemAll=on'';
+
+// Export codes whose content is stored unchanged under a single predicate
+var simpleFields = {
+	"TI": prefixDC + "title",
+	"SO": prefixDummy + "publication",
+	"VO": prefixDummy + "volume",
+	"NO": prefixDummy + "number",
+	"SE": prefixDummy + "series",
+	"DA": prefixDC + "date",
+	"PP": prefixDummy + "pages",
+	"PB": prefixDC + "publisher"
+};
+
+// NOTE: the first request removes every citation the user had marked
+utilities.HTTPUtilities.doPost(''http://www.jstor.org/browse'', postData, null, function() {	// clear marked
+	utilities.HTTPUtilities.doGet(saveCitation, null, function() {								// mark this
+		utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse/citations.txt?exportAction=Save+as+Text+File&exportFormat=cm&''+match[0], null, function(text) {
+																								// get marked
+			var lines = text.split("\n");
+			var haveStarted = false;
+			
+			// Values are collected first because the statement subject
+			// (stableURL) is only known once the EI line has been read
+			var data = new Object();
+			data[prefixDC + "title"] = new Array();
+			data[prefixDC + "creator"] = new Array();
+			data[prefixDummy + "publication"] = new Array();
+			data[prefixDummy + "volume"] = new Array();
+			data[prefixDummy + "number"] = new Array();
+			data[prefixDummy + "series"] = new Array();
+			data[prefixDC + "date"] = new Array();
+			data[prefixDummy + "pages"] = new Array();
+			data[prefixDC + "identifier"] = new Array();
+			data[prefixDC + "publisher"] = new Array();
+			
+			var stableURL;
+			
+			for(var i = 0; i < lines.length; i++) {
+				if(haveStarted) {
+					// Record lines look like "XX : content"; anything else
+					// ends the record
+					var fieldCode = lines[i].substring(0, 2);
+					var fieldContent = cleanString(lines[i].substring(5));
+					
+					if(lines[i].substring(2, 5) != " : ") {
+						break;
+					}
+					
+					if(fieldCode == "AU") {
+						var authors = fieldContent.split(";");
+						for(var j = 0; j < authors.length; j++) {
+							var author = cleanString(authors[j]);
+							if(!author) {
+								continue;
+							}
+							// Turn "Last, First" into "First Last"; a name
+							// without a comma is kept as it is
+							var splitNames = author.split('', '');
+							if(splitNames.length > 1) {
+								author = splitNames[1]+'' ''+splitNames[0];
+							}
+							data[prefixDC + "creator"].push(author);
+						}
+					} else if(fieldCode == "EI") {
+						stableURL = fieldContent;
+					} else if(fieldCode == "IN") {
+						data[prefixDC + "identifier"].push("ISSN "+fieldContent);
+					} else if(simpleFields[fieldCode]) {
+						data[simpleFields[fieldCode]].push(fieldContent);
+					}
+				}
+				// The record starts on the line after the "<1>" marker
+				if(lines[i].substring(0,3) == "<1>") {
+					haveStarted = true;
+				}
+			}
+			
+			// Without an EI line fall back to the page address so that the
+			// statements still get a defined subject
+			if(!stableURL) {
+				stableURL = uri;
+			}
+			
+			// Loop through again so that we can add with the stableURL
+			model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journal", false);
+			for(var predicate in data) {
+				var values = data[predicate];
+				for(var k = 0; k < values.length; k++) {
+					model.addStatement(stableURL, predicate, values[k]);
+				}
+			}
+			
+			done();
+		})
+	})
+});
+
+wait();');
+
+-- Scraper 5: History Cooperative journal articles. No detection script
+-- (NULL); the scraping script copies values from the page meta tags.
+INSERT INTO "scrapers" VALUES(5, NULL, NULL, 20060603002000, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/journals/.+/.+/.+\.html', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+
+var metaTags = doc.getElementsByTagName("meta");
+
+// Adds the content of the meta tag named field as a statement with the
+// predicate rdfUri; does nothing when the page has no such tag
+function associateMeta(field, rdfUri) {
+	var metaTag = metaTags.namedItem(field);
+	if(metaTag) {
+		model.addStatement(uri, rdfUri, metaTag.getAttribute("content"), false);
+	}
+}
+
+associateMeta("Title", prefixDC + "title");
+associateMeta("Journal", prefixDummy + "publication");
+associateMeta("Volume", prefixDummy + "volume");
+associateMeta("Issue", prefixDummy + "number");
+
+// The Author tag lists several names joined by " and "
+var author = metaTags.namedItem("Author");
+if(author) {
+	var authors = author.getAttribute("content").split(" and ");
+	for(var j = 0; j < authors.length; j++) {
+		model.addStatement(uri, prefixDC + "creator", authors[j], false);
+	}
+}
+
+// A date is only recorded when both month and year tags are present
+var month = metaTags.namedItem("PublicationMonth");
+var year = metaTags.namedItem("PublicationYear");
+if(month && year) {
+	model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false);
+}
+');
+
+-- Scraper 6: InnoPAC. No detection script (NULL); the scraping script loads
+-- the MARC view of the record in the background and imports it.
+INSERT INTO "scrapers" VALUES(6, NULL, NULL, 20060603002000, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/frameset\&FF=', NULL,
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+// The MARC view uses the same address with "frameset" replaced by "marc"
+var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$'');
+var m = matchRegexp.exec(uri);
+var newUri = m[1]+''marc''+m[2];
+
+// NOTE(review): the second callback is empty; presumably it is the failure
+// handler, in which case done() is never reached on error -- confirm.
+utilities.loadDocument(newUri, browser, function(newBrowser) {
+	var newDoc = newBrowser.contentDocument;
+	
+	var namespace = newDoc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+	  if (prefix == ''x'') return namespace; else return null;
+	} : null;
+		
+	// Returns the first node matching xpath relative to contextNode, or null
+	var getNode = function(doc, contextNode, xpath, nsResolver) {
+	  return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+	}
+	
+	var xpath = ''//pre'';
+	var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
+	
+	// Evaluate against newDoc: the context node belongs to the loaded
+	// document, not to the page the scraper was started from
+	var text = getNode(newDoc, elmts[0], ''./text()[1]'', nsResolver).nodeValue;
+	
+	var record = new MARC_Record();
+	record.load(text, "MARC_PAC");
+	model = utilities.importMARCRecord(record, uri, model);
+	done();
+}, function() {})
+
+wait();');
+
+-- Scraper 7: SIRSI. The first script string returns true when the page has
+-- rows with "viewmarctags" header and data cells; the second script turns
+-- those rows into statements.
+INSERT INTO "scrapers" VALUES(7, NULL, NULL, 20060603002000, 'SIRSI Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
+'var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+if(elmts.length) {
+	return true;
+}
+return false;',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Returns the first node matching xpath relative to contextNode, or null
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+  return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+// Strips leading characters that are neither word characters nor "(" and
+// trailing characters that are neither word characters nor ")"
+function stringTrimmer(x) {
+	x = x.replace(/^[^\w(]+/, "");
+	return x.replace(/[^\w)]+$/, "");
+}
+
+// Returns the first run of digits in x, or undefined when there is none
+function getAnyNumber(x) {
+	var re = /[0-9]+/;
+	var m = re.exec(x);
+	if(m) {
+		return m[0];
+	}
+}
+
+// Returns the leading run of digits and X characters (at least two
+// characters, starting with a digit), or undefined when x has none
+function getISBN(x) {
+	var re = /^[0-9](?:[0-9X]+)/;
+	var m = re.exec(x);
+	if(m) {
+		return m[0];
+	}
+}
+
+// Strips surrounding punctuation and turns "Last, First" into "First Last"
+function cleanAuthor(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+	// Add period for initials
+	if(author.substring(author.length-2, author.length-1) == " ") {
+		author += ".";
+	}
+	var splitNames = author.split('', '');
+	if(splitNames.length > 1) {
+		author = splitNames[1]+'' ''+splitNames[0];
+	}
+	return author;
+}
+
+var uri = doc.location.href;
+// Values already added per predicate, used to skip duplicates
+var data = new Object();
+
+var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	var elmt = elmts[i];
+	try {
+		// Prefer the text of a link in the data cell, else the cell text
+		var node = getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
+		if(!node) {
+			node = getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
+		}
+		if(node) {
+			var field = stringTrimmer(getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
+			field = field.toLowerCase();
+			var value = stringTrimmer(node.nodeValue);
+			var rdfUri = null;
+			if(field == "publisher") {
+				rdfUri = prefixDC + ''publisher'';
+			} else if(field == "pub date") {
+				rdfUri = prefixDC + ''date'';
+				value = getAnyNumber(value);
+			} else if(field == "isbn") {
+				rdfUri = prefixDC + ''identifier'';
+				// Leave value empty rather than storing "ISBN undefined"
+				var isbn = getISBN(value);
+				value = isbn ? ''ISBN ''+isbn : null;
+			} else if(field == "title") {
+				rdfUri = prefixDC + ''title'';
+				var titleParts = value.split(" / ");
+				value = titleParts[0];
+			} else if(field == "publication info") {
+				rdfUri = prefixDummy + ''place'';
+				var pubParts = value.split(" : ");
+				value = pubParts[0];
+			} else if(field == "personal author") {
+				rdfUri = prefixDC + ''creator'';
+				value = cleanAuthor(node.nodeValue);
+			} else if(field == "added author") {
+				rdfUri = prefixDC + ''contributor'';
+				value = cleanAuthor(node.nodeValue);
+			} else if(field == "corporate author") {
+				rdfUri = prefixDC + ''creator'';
+			}
+			// Rows whose value could not be parsed are skipped
+			if(rdfUri && value) {
+				if(!data[rdfUri]) {
+					data[rdfUri] = new Array();
+				}
+				var insert = true;
+				for(var j = 0; j < data[rdfUri].length; j++) {
+					if(data[rdfUri][j] == value) {
+						insert = false;
+						break;
+					}
+				}
+				if(insert) {
+					data[rdfUri].push(value);
+					model.addStatement(uri, rdfUri, value, true);
+				}
+			}
+		}
+	} catch (e) {
+		// Best effort: a malformed row is reported and skipped
+		utilities.debugPrint("SIRSI scraper: skipped row: "+e);
+	}
+	
+} 
+');
+
+-- Scraper 8: ProQuest document pages. The detection script is an empty
+-- string; the scraping script reads title, authors and the details table.
+INSERT INTO "scrapers" VALUES(8, NULL, NULL, 20060603002000, 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Returns the first node matching xpath relative to contextNode, or null
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+  return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+// Returns the nodeValue of the first node matching xpath relative to
+// contextNode, or null when nothing matches
+function getNodeValue(contextNode, xpath) {
+	var node = getNode(doc, contextNode, xpath, nsResolver);
+	return node ? node.nodeValue : null;
+}
+
+// Strips leading characters that are neither word characters nor "(" and
+// trailing characters that are neither word characters nor ")"
+function stringTrimmer(x) {
+	x = x.replace(/^[^\w(]+/, "");
+	return x.replace(/[^\w)]+$/, "");
+}
+
+// Returns the first run of digits and hyphens in x, or undefined
+function getPageRange(x) {
+	var re = /[0-9\-]+/;
+	var m = re.exec(x);
+	if(m) {
+		return m[0];
+	}
+}
+
+// Strips surrounding punctuation and turns "Last, First" into "First Last"
+function cleanAuthor(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+	// Add period for initials
+	if(author.substring(author.length-2, author.length-1) == " ") {
+		author += ".";
+	}
+	var splitNames = author.split('', '');
+	if(splitNames.length > 1) {
+		author = splitNames[1]+'' ''+splitNames[0];
+	}
+	return author;
+}
+
+var uri = doc.location.href;
+
+// Title
+var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+var title = "";
+for (var i = 0; i < elmts.length; i++) {
+	title += elmts[i].nodeValue;
+}
+if(title) {
+	model.addStatement(uri, prefixDC + ''title'', title, true);
+}
+
+// Authors
+xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="textMedium"]/a/em'';
+elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	// Dirty hack to fix highlighted words
+	var author = "";
+	var authorElmts = utilities.gatherElementsOnXPath(doc, elmts[i], ''.//text()'', nsResolver);
+	for (var j = 0; j < authorElmts.length; j++) {
+		author += authorElmts[j].nodeValue;
+	}
+	model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(author), true);
+}
+
+// Other info
+var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/;
+xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr'';
+elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	var elmt = elmts[i];
+	// A row without a label must not abort the remaining rows
+	var label = getNodeValue(elmt, ''./TD[1]/text()[1]'');
+	if(!label) {
+		continue;
+	}
+	var field = stringTrimmer(label).toLowerCase();
+	if(field == "publication title") {
+		var publication = getNodeValue(elmt, ''./TD[2]/A[1]/text()[1]'');
+		if(publication) {
+			model.addStatement(uri, prefixDummy + ''publication'', stringTrimmer(publication), true);
+		}
+		var place = getNodeValue(elmt, ''./TD[2]/text()[1]'');
+		if(place) {
+			model.addStatement(uri, prefixDummy + ''place'', stringTrimmer(place), true);
+		}
+		var date = getNodeValue(elmt, ''./TD[2]/A[2]/text()[1]'');
+		if(date) {
+			model.addStatement(uri, prefixDC + ''date'', stringTrimmer(date), true);
+		}
+		var moreInfo = getNodeValue(elmt, ''./TD[2]/text()[2]'');
+		if(moreInfo) {
+			// Issue details and page range are separated by ";" followed by
+			// a non-breaking space
+			moreInfo = stringTrimmer(moreInfo);
+			var parts = moreInfo.split(";\xA0");
+			
+			var issueInfo = parts[0].split(",\xA0");
+			for(var j = 0; j < issueInfo.length; j++) {
+				var m = issueRegexp.exec(issueInfo[j]);
+				if(!m) {
+					// Not of the form "Label. value"
+					continue;
+				}
+				var info = m[1].toLowerCase();
+				if(info == "vol") {
+					model.addStatement(uri, prefixDummy + ''volume'', stringTrimmer(m[2]), true);
+				} else if(info == "iss" || info == "no") {
+					model.addStatement(uri, prefixDummy + ''number'', stringTrimmer(m[2]), true);
+				}
+			}
+			if(parts[1] && stringTrimmer(parts[1]).substring(0, 3).toLowerCase() == "pg.") {
+				var pages = getPageRange(parts[1]);
+				if(pages) {
+					model.addStatement(uri, prefixDummy + ''pages'', pages, true);
+				}
+			}
+		}
+	} else if(field == "source type") {
+		var value = getNodeValue(elmt, ''./TD[2]/text()[1]'');
+		if(value) {
+			value = stringTrimmer(value).toLowerCase();
+			
+			// Anything that is not a newspaper or periodical is typed as book
+			if(value == "newspaper" || value == "periodical") {
+				model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+			} else {
+				model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+			}
+		}
+	} else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
+		var value = getNodeValue(elmt, ''./TD[2]/text()[1]'');
+		if(value) {
+			// The identifier type is decided by the length of the value
+			var type;
+			value = stringTrimmer(value);
+			if(value.length == 10 || value.length == 13) {
+				type = "ISBN";
+			} else if(value.length == 8) {
+				type = "ISSN";
+			}
+			if(type) {
+				model.addStatement(uri, prefixDC + "identifier", type+" "+value, false);
+			}
+		}
+	}
+}');
+
+-- Scraper 9: InfoTrac. The first script string accepts pages whose title
+-- starts with "Article "; the second reads citation data from HTML comments.
+INSERT INTO "scrapers" VALUES(9, NULL, NULL, 20060603002000, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
+'if(doc.title.substring(0, 8) == "Article ") {
+	return true;
+}
+return false;',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Strips surrounding punctuation and turns "Last, First" into "First Last"
+function cleanAuthor(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+	// Add period for initials
+	if(author.substring(author.length-2, author.length-1) == " ") {
+		author += ".";
+	}
+	var splitNames = author.split('', '');
+	if(splitNames.length > 1) {
+		author = splitNames[1]+'' ''+splitNames[0];
+	}
+	return author;
+}
+
+var uri = doc.location.href;
+
+var pagesRegexp = /p(\w+)\((\w+)\)/;
+
+var xpath = ''/html/body//comment()'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	var elmt = elmts[i];
+	// Each comment is read as one character, a field name, ":", the value
+	// and one trailing character
+	var colon = elmt.nodeValue.indexOf(":");
+	if(colon < 0) {
+		continue;
+	}
+	var field = elmt.nodeValue.substring(1, colon).toLowerCase();
+	var value = elmt.nodeValue.substring(colon+1, elmt.nodeValue.length-1);
+	if(field == "title") {
+		model.addStatement(uri, prefixDC + "title", value, false);
+	} else if(field == "journal") {
+		model.addStatement(uri, prefixDummy + "publication", value, false);
+	} else if(field == "pi") {
+		// Leading words form the date; words starting with v, i or p carry
+		// volume, issue and pages
+		var parts = value.split(" ");
+		var date = "";
+		var isDate = true;
+		var rdfUri;
+		for(var j = 0; j < parts.length; j++) {
+			var firstChar = parts[j].substring(0, 1);
+			rdfUri = false;
+			
+			if(firstChar == "v") {
+				rdfUri = prefixDummy + "volume";
+			} else if(firstChar == "i") {
+				// Same predicate the other scrapers use for issue numbers
+				rdfUri = prefixDummy + "number";
+			} else if(firstChar == "p") {
+				rdfUri = prefixDummy + "pages";
+				var match = pagesRegexp.exec(parts[j]);
+				if(match) {
+					// "pSTART(COUNT)" becomes "pSTART-END"; left untouched
+					// when either part is not a decimal number.
+					// NOTE(review): END is START+COUNT as before; confirm
+					// whether it should be START+COUNT-1.
+					var firstPage = parseInt(match[1], 10);
+					var pageCount = parseInt(match[2], 10);
+					if(!isNaN(firstPage) && !isNaN(pageCount)) {
+						var finalPage = firstPage+pageCount;
+						parts[j] = "p"+match[1]+"-"+finalPage.toString();
+					}
+				}
+			}
+			
+			if(rdfUri) {
+				isDate = false;
+				if(parts[j] != "pNA") {		// not a real page number
+					var content = parts[j].substring(1);
+					model.addStatement(uri, rdfUri, content, true);
+				}
+			} else if(isDate) {
+				date += " "+parts[j];
+			}
+		}
+		if(date != "") {
+			model.addStatement(uri, prefixDC + "date", date.substring(1), false);
+		}
+	} else if(field == "author") {
+		model.addStatement(uri, prefixDC + "creator", cleanAuthor(value), false);
+	}
+}
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
+
+-- Scraper 10: LexisNexis document pages. No detection script (NULL); the
+-- scraping script parses the HTML of the "bodytext" div with regular
+-- expressions.
+INSERT INTO "scrapers" VALUES(10, NULL, NULL, 20060603002000, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL,
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+// Turns <br> tags into newlines and removes every other tag
+function clearTags(x) {
+	x = x.replace(/<br[^>]*>/gi, "\n");
+	return x.replace(/<[^>]+>/g, "");
+}
+
+var uri = doc.location.href;
+
+// Find the first div with class "bodytext"
+var citationDataDiv;
+var divs = doc.getElementsByTagName("div");
+for(var i = 0; i < divs.length; i++) {
+	if(divs[i].className == "bodytext") {
+		citationDataDiv = divs[i];
+		break;
+	}
+}
+if(!citationDataDiv) {
+	throw new Error("LexisNexis scraper: no bodytext div found");
+}
+
+// Publication: last <br>-separated part of the first <center> element
+var centerElements = citationDataDiv.getElementsByTagName("center");
+var elementParts = centerElements[0].innerHTML.split(/<br[^>]*>/gi);
+model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true);
+
+// Date: taken from the last <center> element, either as a bold month name
+// followed by day and year, or else as its second <br>-separated part
+var lastCenterHTML = centerElements[centerElements.length-1].innerHTML;
+var dateRegexp = /<br[^>]*><b>([A-Z][a-z]+)<\/b> ([0-9]+, [0-9]{4})/;
+var dateMatch = dateRegexp.exec(lastCenterHTML);
+if(dateMatch) {
+	model.addStatement(uri, prefixDC + "date", dateMatch[1]+" "+dateMatch[2], true);
+} else {
+	var dateParts = lastCenterHTML.split(/<br[^>]*>/gi);
+	if(dateParts[1]) {
+		model.addStatement(uri, prefixDC + "date", dateParts[1], true);
+	}
+}
+
+// Only the part before the BODY or TEXT marker is searched for fields
+var citationData;
+var cutIndex = citationDataDiv.innerHTML.indexOf("<b>BODY:</b>");
+if(cutIndex < 0) {
+	cutIndex = citationDataDiv.innerHTML.indexOf("<b>TEXT:</b>");
+}
+if(cutIndex > 0) {
+	citationData = citationDataDiv.innerHTML.substring(0, cutIndex);
+} else {
+	citationData = citationDataDiv.innerHTML;
+}
+
+citationData = clearTags(citationData);
+
+var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/;
+var headlineMatch = headlineRegexp.exec(citationData);
+if(headlineMatch) {
+	model.addStatement(uri, prefixDC + "title", clearTags(headlineMatch[1]), true);
+}
+
+var bylineRegexp = /\nBYLINE:  *(\w[\w\- ]+)/;
+var bylineMatch = bylineRegexp.exec(citationData);
+if(bylineMatch) {
+	var byline = bylineMatch[1];
+	utilities.debugPrint(byline.substring(0, 3).toLowerCase());
+	// Drop a leading "by "
+	if(byline.substring(0, 3).toLowerCase() == "by ") {
+		byline = byline.substring(3);
+	}
+	model.addStatement(uri, prefixDC + "creator", byline, true);
+}
+
+var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/;
+var authorMatch = authorRegexp.exec(citationData);
+if(authorMatch) {
+	// Names are separated by ", " or ", and "
+	var authors = authorMatch[1].split(/, (?:and )?/);
+	for(var j = 0; j < authors.length; j++) {
+		// The string pattern removes the first literal " *" from the name
+		model.addStatement(uri, prefixDC + "creator", authors[j].replace(" *", ""), true);
+	}
+}
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+
+utilities.debugPrint(citationData);');
+
+-- Scraper 11: Aleph. No detection script (NULL); the scraping script loads
+-- the format=001 view of the record and rebuilds a MARC record from its
+-- table rows.
+INSERT INTO "scrapers" VALUES(11, NULL, NULL, 20060603002000, 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL,
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+var newUri = uri.replace("&format=999", "&format=001");
+utilities.debugPrint(newUri);
+
+// Strips leading characters that are neither word characters nor "(" and
+// trailing characters that are neither word characters nor ")"
+function stringTrimmer(x) {
+	x = x.replace(/^[^\w(]+/, "");
+	return x.replace(/[^\w)]+$/, "");
+}
+
+// NOTE(review): the second callback is empty; presumably it is the failure
+// handler, in which case done() is never reached on error -- confirm.
+utilities.loadDocument(newUri, browser, function(newBrowser) {
+	var newDoc = newBrowser.contentDocument;
+	
+	var namespace = newDoc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+	  if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	
+	// Returns the first node matching xpath relative to contextNode, or null
+	var getNode = function(doc, contextNode, xpath, nsResolver) {
+	  return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+	}
+	
+	var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]'';
+	var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
+	var record = new MARC_Record();
+	for(var i=0; i<elmts.length; i++) {
+		var elmt = elmts[i];
+		// Evaluate against newDoc, the document the rows belong to
+		var fieldNode = getNode(newDoc, elmt, ''./TD[1]/text()[1]'', nsResolver);
+		var valueNode = getNode(newDoc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+		if(!fieldNode || !valueNode) {
+			// A row with an empty cell must not abort the whole import
+			continue;
+		}
+		var field = stringTrimmer(fieldNode.nodeValue);
+		// "|a " style subfield markers become delimiter plus subfield code
+		var value = valueNode.nodeValue.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1");
+		
+		if(field != "FMT" && field != "LDR") {
+			// The first cell holds the three character tag followed by up
+			// to two indicator characters
+			var code = field.substring(0, 3);
+			var ind1 = "";
+			var ind2 = "";
+			if(field.length > 3) {
+				ind1 = field.charAt(3);
+				if(field.length > 4) {
+					ind2 = field.charAt(4);
+				}
+			}
+			record.add_field(code, ind1, ind2, value);
+		}
+	}
+	
+	model = utilities.importMARCRecord(record, uri, model);
+	done();
+}, function() {})
+
+wait();');
+
+
+-- Scraper 12: Dynix. No detection script (NULL); the scraping script loads
+-- the page again with fullmarc=true and rebuilds a MARC record from its
+-- table rows.
+INSERT INTO "scrapers" VALUES(12, NULL, NULL, 20060603002000, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*uri=full=[0-9]', NULL,
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+var newUri = uri+''&fullmarc=true'';
+utilities.debugPrint(newUri);
+
+// Strips leading characters that are neither word characters nor "(" and
+// trailing characters that are neither word characters nor ")"
+function stringTrimmer(x) {
+	x = x.replace(/^[^\w(]+/, "");
+	return x.replace(/[^\w)]+$/, "");
+}
+	
+// Returns the first node matching xpath relative to contextNode, or null
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+	return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+	
+// Returns the concatenated nodeValue of every node matching xpath
+var getNodeString = function(doc, contextNode, xpath, nsResolver) {
+	var elmts = utilities.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
+	var returnVar = "";
+	for(var i=0; i<elmts.length; i++) {
+		returnVar += elmts[i].nodeValue;
+	}
+	return returnVar;
+}
+
+// NOTE(review): the second callback is empty; presumably it is the failure
+// handler, in which case done() is never reached on error -- confirm.
+utilities.loadDocument(newUri, browser, function(newBrowser) {
+	var newDoc = newBrowser.contentDocument;
+	
+	var namespace = newDoc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+	  if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	
+	// Optional leading indicator digits, a space, then the field content
+	var valRegexp = /^([0-9])([0-9])? (.*)$/;
+	
+	var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]'';
+	var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
+	var record = new MARC_Record();
+	for(var i=0; i<elmts.length; i++) {
+		var elmt = elmts[i];
+		// Evaluate against newDoc, the document the rows belong to
+		var fieldNode = getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
+		if(!fieldNode) {
+			// A row without a tag must not abort the whole import
+			continue;
+		}
+		var field = stringTrimmer(fieldNode.nodeValue);
+		var value = getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver);
+		// "$a " style subfield markers become delimiter plus subfield code
+		value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1");
+		
+		if(field != "FMT" && field != "LDR") {
+			var ind1 = "";
+			var ind2 = "";
+			var m = valRegexp.exec(value);
+			if(m) {
+				ind1 = m[1];
+				// Test the captured digit, not the still-empty ind2, so the
+				// second indicator is actually stored
+				if(m[2]) {
+					ind2 = m[2];
+				}
+				value = m[3];
+			}
+			record.add_field(field, ind1, ind2, value);
+		}
+	}
+	
+	model = utilities.importMARCRecord(record, uri, model);
+	done();
+}, function() {})
+
+wait();');
+COMMIT;
+\ No newline at end of file

	www Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

M	chrome/chromeFiles/content/scholar/ingester/browser.js	\|	3	++-
M	chrome/chromeFiles/content/scholar/ingester/browser.xul	\|	3	+++
M	chrome/chromeFiles/content/scholar/xpcom/ingester.js	\|	217	+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
M	chrome/chromeFiles/content/scholar/xpcom/marc.js	\|	53	+++++++++++++++++++++++++++++++++++++++++++++--------
A	scrapers.sql	\|	1015	+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++