
commit 4242c62b1b05e6248cfdf2b962c8d0eb46711588
parent 76c118e9e8ee9029a91feafafc5b834c164ea576
Author: Simon Kornblith <simon@simonster.com>
Date:   Mon, 26 Jun 2006 20:02:30 +0000

- Fix redundancy in utilities.js (I accidentally copied and pasted a much larger block of code than I meant to)
- Move processDocuments, a function for loading a DOM representation of a document or set of documents, to Scholar.Utilities.HTTP
- Add Scholar.Ingester.ingestURL, a simplified function to scrape a URL (closes #33)
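
As a usage sketch (not part of this commit's diff): a caller might invoke the
new one-shot API as below. The callback names and example URL are hypothetical;
the signature and the meaning of the document object's items array follow the
doc comment added to ingester.js in this commit.

    // Hypothetical caller (illustration only)
    Scholar.Ingester.ingestURL(
        "http://www.example.com/catalog/record/1234",   // url - URL to scrape
        function(myDoc) {
            // complete - myDoc.items holds *unsaved* scraped items;
            // an empty array means the page was unscrapable
            Scholar.debug("scraped "+myDoc.items.length+" item(s)");
        },
        function() {
            // error - the page failed to load
            Scholar.debug("could not load page");
        }
        // myWindow omitted: the hidden DOM window is used and
        // multi-page scraping is skipped
    );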


Diffstat:
M chrome/chromeFiles/content/scholar/ingester/browser.js |   2 +-
M chrome/chromeFiles/content/scholar/xpcom/ingester.js   |  51 +++++++++++++++++++++++++++++++++++++++++++++++----
M chrome/chromeFiles/content/scholar/xpcom/utilities.js  | 894 +++++++++++++++++++------------------------------------------------------------
M scrapers.sql                                           |   4 ++--
4 files changed, 264 insertions(+), 687 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -165,7 +165,7 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject)
 
 //////////////////////////////////////////////////////////////////////////////
 //
-// Private Scholar.Ingester.Document methods
+// Private Scholar_Ingester_Interface methods
 //
 //////////////////////////////////////////////////////////////////////////////
 
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -19,6 +19,48 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) {
 	Scholar.debug("deleted hidden browser");
 }
 
+/*
+ * Operates the ingester given only a URL
+ * url - URL to scrape
+ * complete - callback function to be executed if page grab completes
+ *            (will be passed document object; obj.items contains array of
+ *            *unsaved* items scraped; empty array indicates unscrapable page)
+ * error - callback function to be executed if an error occurred loading page
+ * myWindow - optional argument indicating window to attach a dialog to. if no
+ *            window is given, Firefox Scholar uses the hidden DOM window and
+ *            will simply avoid scraping multiple pages
+ */
+Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) {
+	var isHidden = false;
+	if(!myWindow) {
+		var myWindow = Components.classes["@mozilla.org/appshell/appShellService;1"]
+			.getService(Components.interfaces.nsIAppShellService)
+			.hiddenDOMWindow;
+		var isHidden = true;
+	}
+	
+	var succeeded = function(browser) {
+		var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden);
+		myDoc.retrieveScraper();
+		if(myDoc.scraper) {
+			myDoc.scrapePage(function(myDoc) {
+				Scholar.Ingester.deleteHiddenBrowser(browser);
+				complete(myDoc);
+			});
+		} else {
+			Scholar.Ingester.deleteHiddenBrowser(browser);
+			complete(myDoc);
+		}
+	}
+	
+	var failed = function() {
+		Scholar.debug("Scholar.Ingester.ingestURL: could not ingest "+url);
+		error();
+	}
+	
+	Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed, true);
+}
+
 /////////////////////////////////////////////////////////////////
 //
 // Scholar.Ingester.ProxyMonitor
@@ -195,10 +237,11 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
 /*
  * Constructor for Document object
  */
-Scholar.Ingester.Document = function(browserWindow, myWindow){
-	this.scraper = this.type = null;
-	this.browser = browserWindow;
+Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) {
+	this.browser = myBrowser;
 	this.window = myWindow;
+	this.isHidden = isHidden;
+	this.scraper = this.type = null;
 	this.model = new Scholar.Ingester.Model();
 	
 	// Create separate URL to account for proxies
@@ -349,7 +392,7 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
 	this._sandbox.browser = this.browser;
 	this._sandbox.doc = this.browser.contentDocument;
 	this._sandbox.url = this.url;
-	this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL);
+	this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden);
 	this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL);
 	this._sandbox.window = this.window;
 	this._sandbox.model = this.model;
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -133,9 +133,10 @@ Scholar.Utilities.prototype.cleanTags = function(x) {
 
 // Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
 // classes relating to data extraction specifically from HTML documents.
-Scholar.Utilities.Ingester = function(myWindow, proxiedURL) {
+Scholar.Utilities.Ingester = function(myWindow, proxiedURL, isHidden) {
 	this.window = myWindow;
 	this.proxiedURL = proxiedURL;
+	this.isHidden = isHidden;
 }
 
 Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
@@ -154,680 +155,8 @@ Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, paren
 	return elmts;
 }
 
-// Loads a single document for a scraper, running succeeded() on success or
-// failed() on failure
-Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
-	Scholar.debug("loadDocument called");
-	this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
-}
-
-// Downloads and processes documents with processor()
-// browser - a browser object
-// firstDoc - the first document to process with the processor (if null,
-//            first document is processed without processor)
-// urls - an array of URLs to load
-// processor - a function to execute to process each document
-// done - a function to execute when all document processing is complete
-// exception - a function to execute if an exception occurs (exceptions are
-//             also logged in the Scholar for Firefox log)
-Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
-	var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
-	var myWindow = this.window;
-	var prevUrl, url;
-	Scholar.debug("processDocuments called");
-	
-	try {
-		if (urls.length == 0) {
-			if(firstDoc) {
-				processor(firstDoc, done);
-			} else {
-				done();
-			}
-			return;
-		}
-		
-		var urlIndex = -1;
-		var doLoad = function() {
-			urlIndex++;
-			if (urlIndex < urls.length) {
-				url = urls[urlIndex];
-				if(this.proxiedURL) {
-					url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
-				}
-				try {
-					Scholar.debug("loading "+url);
-					hiddenBrowser.loadURI(url);
-				} catch (e) {
-					Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
-					exception(e);
-				}
-			} else {
-				hiddenBrowser.removeEventListener("load", onLoad, true);
-				Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
-				done();
-			}
-		};
-		var onLoad = function() {
-			Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
-			if(hiddenBrowser.contentDocument.location.href != prevUrl) {	// Just in case it fires too many times
-				prevUrl = hiddenBrowser.contentDocument.location.href;
-				try {
-					var newHiddenBrowser = new Object();
-					newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
-					newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
-					processor(newHiddenBrowser);
-				} catch (e) {
-					Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
-					exception(e);
-				}
-				doLoad();
-			}
-		};
-		var init = function() {
-			Scholar.debug("init called");
-			hiddenBrowser.addEventListener("load", onLoad, true);
-			
-			if (firstDoc) {
-				Scholar.debug("processing");
-				processor(firstDoc, doLoad);
-			} else {
-				Scholar.debug("doing load");
-				doLoad();
-			}
-		}
-		
-		init();
-	} catch (e) {
-		Scholar.debug("processDocuments: " + e);
-		exception(e);
-	}
-}
-
-// Appears to look for links in a document containing a certain substring (kind
-// of like getItemArray, only with NO REGEXP FUNCTIONALITY)
-Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) {
-	var urls = [];
-	var addedURLs = [];
-	
-	var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
-	var aElement = aElements.iterateNext();
-	while (aElement) {
-		var href = aElement.href;
-		if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
-			urls.unshift(href);
-			addedURLs[href] = true;
-		}
-		aElement = aElements.iterateNext();
-	}
-	return urls;
-}
-
-// For now, we're going to skip the getLLsFromAddresses function (which gets
-// latitude and longitude pairs from a series of addresses, but requires the
-// big mess of Java code that is the Piggy Bank server) and the geoHelper
-// tools (which rely on getLLsFromAddresses) since these are probably not
-// essential components for Scholar and would take a great deal of effort to
-// implement. We can, however, always implement them later.
-
-/*
- * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
- */
-
-/*
- * Gets a given node (assumes only one value)
- */
-Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
-	return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext();
-}
-
-/*
- * Gets a given node as a string containing all child nodes
- */
-Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) {
-	var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
-	var returnVar = "";
-	for(var i=0; i<elmts.length; i++) {
-		returnVar += elmts[i].nodeValue;
-	}
-	return returnVar;
-}
-
-/*
- * Allows a user to select which items to scrape
- */
-Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) {
-	// mozillazine made me do it! honest!
-	var io = { dataIn:itemList, dataOut:null }
-	var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
-		"_blank","chrome,modal,centerscreen,resizable=yes", io);
-	return io.dataOut;
-}
-
-/*
- * Grabs items based on URLs
- */
-Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) {
-	var availableItems = new Object();	// Technically, associative arrays are objects
-	
-	// Require link to match this
-	if(urlRe) {
-		var urlRegexp = new RegExp();
-		urlRegexp.compile(urlRe, "i");
-	}
-	// Do not allow text to match this
-	if(rejectRe) {
-		var rejectRegexp = new RegExp();
-		rejectRegexp.compile(rejectRe, "i");
-	}
-	
-	if(!inHere.length) {
-		inHere = new Array(inHere);
-	}
-	
-	for(var j=0; j<inHere.length; j++) {
-		var links = inHere[j].getElementsByTagName("a");
-		for(var i=0; i<links.length; i++) {
-			if(!urlRe || urlRegexp.test(links[i].href)) {
-				var text = this.getNodeString(doc, links[i], './/text()', null);
-				if(text) {
-					text = this.cleanString(text);
-					if(!rejectRe || !rejectRegexp.test(text)) {
-						if(availableItems[links[i].href]) {
-							if(text != availableItems[links[i].href]) {
-								availableItems[links[i].href] += " "+text;
-							}
-						} else {
-							availableItems[links[i].href] = text;
-						}
-					}
-				}
-			}
-		}
-	}
-	
-	return availableItems;
-}
-
-/*
- * Handles OAI-PMH requests
- */
-Scholar.Utilities.Ingester.prototype.importOAIPMH = function(uri, model) {
-	
-}
-
-// These functions are for use by importMARCRecord. They're private, because,
-// while they are useful, it's also nice if as many of our scrapers as possible
-// are PiggyBank compatible, and if our scrapers used functions, that would
-// break compatibility
-Scholar.Utilities.Ingester.prototype._MARCCleanString = function(author) {
-	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
-	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
-	return author.replace(/ +/, ' ');
-}
-
-Scholar.Utilities.Ingester.prototype._MARCCleanNumber = function(author) {
-	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
-	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
-	var regexp = /^[^ ]*/;
-	var m = regexp.exec(author);
-	if(m) {
-		return m[0];
-	}
-}
-Scholar.Utilities.Ingester.prototype._MARCPullYear = function(text) {
-	var pullRe = /[0-9]+/;
-	var m = pullRe.exec(text);
-	if(m) {
-		return m[0];
-	}
-}
-
-Scholar.Utilities.Ingester.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
-	if(!part) {
-		part = 'a';
-	}
-	var field = record.get_field_subfields(fieldNo);
-	Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
-	if(field) {
-		for(i in field) {
-			var value;
-			for(var j=0; j<part.length; j++) {
-				var myPart = part.substr(j, 1);
-				if(field[i][myPart]) {
-					if(value) {
-						value += " "+field[i][myPart];
-					} else {
-						value = field[i][myPart];
-					}
-				}
-			}
-			if(value) {
-				if(execMe) {
-					value = execMe(value);
-				}
-				if(prefix) {
-					value = prefix + value;
-				}
-				model.addStatement(uri, rdfUri, value);
-			}
-		}
-	}
-	return model;
-}
-
-// This is an extension to PiggyBank's architecture. It's here so that we don't
-// need an enormous library for each scraper that wants to use MARC records
-Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, model) {
-	var prefixDC = 'http://purl.org/dc/elements/1.1/';
-	var prefixDCMI = 'http://purl.org/dc/dcmitype/';
-	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
-	var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
-	
-	// Extract ISBNs
-	model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
-	// Extract ISSNs
-	model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
-	// Extract creators
-	model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
-	model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
-	model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
-	if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
-		// some LOC entries have no listed author, but have the author in the person subject field as the first entry
-		var field = record.get_field_subfields('600');
-		if(field[0]) {
-			model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
-		}
-	}
-	// Extract title
-	model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString, '', 'ab');
-	// Extract edition
-	model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
-	// Extract place info
-	model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
-	// Extract publisher info
-	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
-	// Extract year
-	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c');
-	// Extract series
-	model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
-	// Extract call number
-	model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab');
-	model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab');
-	model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab');
-	model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab');
-	model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a');
-	model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab');
-	
-	// Set type
-	model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true);
-}
-
-/*
- * END SCHOLAR FOR FIREFOX EXTENSIONS
- */
-
-// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
-// accessed outside the sandbox, and even if it could, it wouldn't let scripts
-// access across domains, so everything's replicated here.
-
-// Scholar for Firefox Utilities
-// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
-// This code is licensed according to the GPL
-
-/////////////////////////////////////////////////////////////////
-//
-// Scholar.Utilities
-//
-/////////////////////////////////////////////////////////////////
-// Scholar.Utilities class, a set of methods to assist in data
-// extraction. Some of the code here was stolen directly from the Piggy Bank
-// project.
-
-Scholar.Utilities = function () {}
-
-// Adapter for Piggy Bank function to print debug messages; log level is
-// fixed at 4 (could change this)
-Scholar.Utilities.prototype.debugPrint = function(msg) {
-	Scholar.debug(msg, 4);
-}
-
-// Appears to trim a string, chopping of newlines/spacing
-Scholar.Utilities.prototype.trimString = function(s) {
-	var i = 0;
-	var spaceChars = " \n\r\t" + String.fromCharCode(160) /* &nbsp; */;
-	while (i < s.length) {
-		var c = s.charAt(i);
-		if (spaceChars.indexOf(c) < 0) {
-			break;
-		}
-		i++;
-	}
-	
-	s = s.substring(i);
-	
-	i = s.length;
-	while (i > 0) {
-		var c = s.charAt(i - 1);
-		if (spaceChars.indexOf(c) < 0) {
-			break;
-		}
-		i--;
-	}
-	
-	return s.substring(0, i);
-}
-
-/*
- * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
- * Functions below this point are extensions to the utilities provided by
- * Piggy Bank. When used in external code, the repository will need to add
- * a function definition when exporting in Piggy Bank format.
- */
-
-/*
- * Converts a JavaScript date object to an ISO-style date
- */
-Scholar.Utilities.prototype.dateToISO = function(jsDate) {
-	var date = "";
-	var year = jsDate.getFullYear().toString();
-	var month = (jsDate.getMonth()+1).toString();
-	var day = jsDate.getDate().toString();
-	
-	for(var i = year.length; i<4; i++) {
-		date += "0";
-	}
-	date += year+"-";
-	
-	if(month.length == 1) {
-		date += "0";
-	}
-	date += month+"-";
-	
-	if(day.length == 1) {
-		date += "0";
-	}
-	date += day;
-	
-	return date;
-}
-
-/*
- * Cleans extraneous punctuation off an author name
- */
-Scholar.Utilities.prototype.cleanAuthor = function(author) {
-	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
-	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
-	author = author.replace(/ +/, ' ');
-	// Add period for initials
-	if(author.substring(author.length-2, author.length-1) == " ") {
-		author += ".";
-	}
-	var splitNames = author.split(', ');
-	if(splitNames.length > 1) {
-		author = splitNames[1]+' '+splitNames[0];
-	}
-	return author;
-}
-
-/*
- * Cleans whitespace off a string and replaces multiple spaces with one
- */
-Scholar.Utilities.prototype.cleanString = function(s) {
-	s = s.replace(/[ \xA0]+/g, " ");
-	return this.trimString(s);
-}
-
-/*
- * Cleans any non-word non-parenthesis characters off the ends of a string
- */
-Scholar.Utilities.prototype.superCleanString = function(x) {
-	var x = x.replace(/^[^\w(]+/, "");
-	return x.replace(/[^\w)]+$/, "");
-}
-
-/*
- * Eliminates HTML tags, replacing <br>s with /ns
- */
-Scholar.Utilities.prototype.cleanTags = function(x) {
-	x = x.replace(/<br[^>]*>/gi, "\n");
-	return x.replace(/<[^>]+>/g, "");
-}
-
-// These functions are for use by importMARCRecord. They're private, because,
-// while they are useful, it's also nice if as many of our scrapers as possible
-// are PiggyBank compatible, and if our scrapers used functions, that would
-// break compatibility
-Scholar.Utilities.prototype._MARCCleanString = function(author) {
-	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
-	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
-	return author.replace(/ +/, ' ');
-}
-
-Scholar.Utilities.prototype._MARCCleanNumber = function(author) {
-	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
-	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
-	var regexp = /^[^ ]*/;
-	var m = regexp.exec(author);
-	if(m) {
-		return m[0];
-	}
-}
-Scholar.Utilities.prototype._MARCPullYear = function(text) {
-	var pullRe = /[0-9]+/;
-	var m = pullRe.exec(text);
-	if(m) {
-		return m[0];
-	}
-}
-
-Scholar.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
-	if(!part) {
-		part = 'a';
-	}
-	var field = record.get_field_subfields(fieldNo);
-	Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
-	if(field) {
-		for(i in field) {
-			var value;
-			for(var j=0; j<part.length; j++) {
-				var myPart = part.substr(j, 1);
-				if(field[i][myPart]) {
-					if(value) {
-						value += " "+field[i][myPart];
-					} else {
-						value = field[i][myPart];
-					}
-				}
-			}
-			if(value) {
-				if(execMe) {
-					value = execMe(value);
-				}
-				if(prefix) {
-					value = prefix + value;
-				}
-				model.addStatement(uri, rdfUri, value);
-			}
-		}
-	}
-	return model;
-}
-
-// This is an extension to PiggyBank's architecture. It's here so that we don't
-// need an enormous library for each scraper that wants to use MARC records
-Scholar.Utilities.prototype.importMARCRecord = function(record, uri, model) {
-	var prefixDC = 'http://purl.org/dc/elements/1.1/';
-	var prefixDCMI = 'http://purl.org/dc/dcmitype/';
-	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
-	var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
-	
-	// Extract ISBNs
-	model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
-	// Extract ISSNs
-	model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
-	// Extract creators
-	model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
-	model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
-	model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
-	model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
-	if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
-		// some LOC entries have no listed author, but have the author in the person subject field as the first entry
-		var field = record.get_field_subfields('600');
-		if(field[0]) {
-			model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
-		}
-	}
-	// Extract title
-	model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString, '', 'ab');
-	// Extract edition
-	model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
-	// Extract place info
-	model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
-	// Extract publisher info
-	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
-	// Extract year
-	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c');
-	// Extract series
-	model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
-	// Extract call number
-	model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab');
-	model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab');
-	model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab');
-	model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab');
-	model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a');
-	model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab');
-	
-	// Set type
-	model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true);
-}
-
-/*
- * END SCHOLAR FOR FIREFOX EXTENSIONS
- */
-
-/////////////////////////////////////////////////////////////////
-//
-// Scholar.Utilities.Ingester
-//
-/////////////////////////////////////////////////////////////////
-// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
-// classes relating to data extraction specifically from HTML documents.
-
-Scholar.Utilities.Ingester = function(myWindow, proxiedURL) {
-	this.window = myWindow;
-	this.proxiedURL = proxiedURL;
-}
-
-Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
-
-// Takes an XPath query and returns the results
-Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
-	var elmts = [];
-	
-	var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
-	var elmt = iterator.iterateNext();
-	var i = 0;
-	while (elmt) {
-		elmts[i++] = elmt;
-		elmt = iterator.iterateNext();
-	}
-	return elmts;
-}
-
-// Loads a single document for a scraper, running succeeded() on success or
-// failed() on failure
-Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
-	Scholar.debug("loadDocument called");
-	this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
-}
-
-// Downloads and processes documents with processor()
-// browser - a browser object
-// firstDoc - the first document to process with the processor (if null,
-//            first document is processed without processor)
-// urls - an array of URLs to load
-// processor - a function to execute to process each document
-// done - a function to execute when all document processing is complete
-// exception - a function to execute if an exception occurs (exceptions are
-//             also logged in the Scholar for Firefox log)
-Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
-	var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
-	var myWindow = this.window;
-	var prevUrl, url;
-	Scholar.debug("processDocuments called");
-	
-	try {
-		if (urls.length == 0) {
-			if(firstDoc) {
-				processor(firstDoc, done);
-			} else {
-				done();
-			}
-			return;
-		}
-		
-		var urlIndex = -1;
-		var doLoad = function() {
-			urlIndex++;
-			if (urlIndex < urls.length) {
-				url = urls[urlIndex];
-				if(this.proxiedURL) {
-					url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
-				}
-				try {
-					Scholar.debug("loading "+url);
-					hiddenBrowser.loadURI(url);
-				} catch (e) {
-					Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
-					exception(e);
-				}
-			} else {
-				hiddenBrowser.removeEventListener("load", onLoad, true);
-				Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
-				done();
-			}
-		};
-		var onLoad = function() {
-			Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
-			if(hiddenBrowser.contentDocument.location.href != prevUrl) {	// Just in case it fires too many times
-				prevUrl = hiddenBrowser.contentDocument.location.href;
-				try {
-					var newHiddenBrowser = new Object();
-					newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
-					newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
-					processor(newHiddenBrowser);
-				} catch (e) {
-					Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
-					exception(e);
-				}
-				doLoad();
-			}
-		};
-		var init = function() {
-			Scholar.debug("init called");
-			hiddenBrowser.addEventListener("load", onLoad, true);
-			
-			if (firstDoc) {
-				Scholar.debug("processing");
-				processor(firstDoc, doLoad);
-			} else {
-				Scholar.debug("doing load");
-				doLoad();
-			}
-		}
-		
-		init();
-	} catch (e) {
-		Scholar.debug("processDocuments: " + e);
-		exception(e);
-	}
-}
-
-// Appears to look for links in a document containing a certain substring
+// Appears to look for links in a document containing a certain substring (kind
+// of like getItemArray, only with NO REGEXP FUNCTIONALITY)
 Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) {
 	var urls = [];
 	var addedURLs = [];
@@ -879,11 +208,15 @@ Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode,
 
 /*
  * Allows a user to select which items to scrape
 */
 Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) {
-	// mozillazine made me do it! honest!
-	var io = { dataIn:itemList, dataOut:null }
-	var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
-		"_blank","chrome,modal,centerscreen,resizable=yes", io);
-	return io.dataOut;
+	if(this.isHidden != true) {
+		// this is kinda ugly, mozillazine made me do it! honest!
+		var io = { dataIn:itemList, dataOut:null }
+		var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
+			"_blank","chrome,modal,centerscreen,resizable=yes", io);
+		return io.dataOut;
+	} else {
+		return null;
+	}
 }
@@ -931,12 +264,129 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
 	return availableItems;
 }
 
+// These functions are for use by importMARCRecord. They're private, because,
+// while they are useful, it's also nice if as many of our scrapers as possible
+// are PiggyBank compatible, and if our scrapers used functions, that would
+// break compatibility
+Scholar.Utilities.Ingester.prototype._MARCCleanString = function(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
+	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+	return author.replace(/ +/, ' ');
+}
+
+Scholar.Utilities.Ingester.prototype._MARCCleanNumber = function(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
+	author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+	var regexp = /^[^ ]*/;
+	var m = regexp.exec(author);
+	if(m) {
+		return m[0];
+	}
+}
+Scholar.Utilities.Ingester.prototype._MARCPullYear = function(text) {
+	var pullRe = /[0-9]+/;
+	var m = pullRe.exec(text);
+	if(m) {
+		return m[0];
+	}
+}
+
+Scholar.Utilities.Ingester.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
+	if(!part) {
+		part = 'a';
+	}
+	var field = record.get_field_subfields(fieldNo);
+	Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
+	if(field) {
+		for(i in field) {
+			var value;
+			for(var j=0; j<part.length; j++) {
+				var myPart = part.substr(j, 1);
+				if(field[i][myPart]) {
+					if(value) {
+						value += " "+field[i][myPart];
+					} else {
+						value = field[i][myPart];
+					}
+				}
+			}
+			if(value) {
+				if(execMe) {
+					value = execMe(value);
+				}
+				if(prefix) {
+					value = prefix + value;
+				}
+				model.addStatement(uri, rdfUri, value);
+			}
+		}
+	}
+	return model;
+}
+
+// This is an extension to PiggyBank's architecture. It's here so that we don't
+// need an enormous library for each scraper that wants to use MARC records
+Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, model) {
+	var prefixDC = 'http://purl.org/dc/elements/1.1/';
+	var prefixDCMI = 'http://purl.org/dc/dcmitype/';
+	var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
+	var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
+	
+	// Extract ISBNs
+	model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
+	// Extract ISSNs
+	model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
+	// Extract creators
+	model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
+	model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
+	model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
+	model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
+	model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
+	model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
+	if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
+		// some LOC entries have no listed author, but have the author in the person subject field as the first entry
+		var field = record.get_field_subfields('600');
+		if(field[0]) {
+			model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
+		}
+	}
+	// Extract title
+	model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString, '', 'ab');
+	// Extract edition
+	model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
+	// Extract place info
+	model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
+	// Extract publisher info
+	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
+	// Extract year
+	model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c');
+	// Extract series
+	model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
+	// Extract call number
+	model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab');
+	model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab');
+	model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab');
+	model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab');
+	model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a');
+	model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab');
+	
+	// Set type
+	model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true);
+}
+
 /*
  * END SCHOLAR FOR FIREFOX EXTENSIONS
  */
 
 // Ingester adapters for Scholar.Utilities.HTTP to handle proxies
+Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
+	Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed);
+}
+Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+	Scholar.Utilities.HTTP.processDocuments(firstDoc, urls, processor, done, exception);
+}
+
 Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) {
 	this.proxiedURL = proxiedURL
 }
@@ -1131,4 +581,88 @@ Scholar.Utilities.HTTP = new function() {
 			break;
 		}
 	}
-}
\ No newline at end of file
+}
+
+// Downloads and processes documents with processor()
+// firstDoc - the first document to process with the processor (if null,
+//            first document is processed without processor)
+// urls - an array of URLs to load
+// processor - a function to execute to process each document
+// done - a function to execute when all document processing is complete
+// exception - a function to execute if an exception occurs (exceptions are
+//             also logged in the Scholar for Firefox log)
+// saveBrowser - whether to save the hidden browser object; usually, you don't
+//               want to do this, because it makes it easier to leak memory
+Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, done, exception, saveBrowser) {
+	var myWindow = Components.classes["@mozilla.org/appshell/appShellService;1"]
+		.getService(Components.interfaces.nsIAppShellService)
+		.hiddenDOMWindow;
+	var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(myWindow);
+	var prevUrl, url;
+	Scholar.debug("processDocuments called");
+	
+	try {
+		if (urls.length == 0) {
+			if(firstDoc) {
+				processor(firstDoc, done);
+			} else {
+				done();
+			}
+			return;
+		}
+		
+		var urlIndex = -1;
+		var doLoad = function() {
+			urlIndex++;
+			if (urlIndex < urls.length) {
+				url = urls[urlIndex];
+				try {
+					Scholar.debug("loading "+url);
+					hiddenBrowser.loadURI(url);
+				} catch (e) {
+					Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
+					exception(e);
+				}
+			} else {
+				hiddenBrowser.removeEventListener("load", onLoad, true);
+				if(!saveBrowser) {
+					Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
+				}
+				done();
+			}
+		};
+		var onLoad = function() {
+			Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
+			if(hiddenBrowser.contentDocument.location.href != prevUrl) {	// Just in case it fires too many times
+				prevUrl = hiddenBrowser.contentDocument.location.href;
+				try {
+					var newHiddenBrowser = new Object();
+					newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
+					newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
+					processor(newHiddenBrowser);
+				} catch (e) {
+					Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
+					exception(e);
+				}
+				doLoad();
+			}
+		};
+		var init = function() {
+			Scholar.debug("init called");
+			hiddenBrowser.addEventListener("load", onLoad, true);
+			
+			if (firstDoc) {
+				Scholar.debug("processing");
+				processor(firstDoc, doLoad);
+			} else {
+				Scholar.debug("doing load");
+				doLoad();
+			}
+		}
+		
+		init();
+	} catch (e) {
+		Scholar.debug("processDocuments: " + e);
+		exception(e);
+	}
+}
\ No newline at end of file
diff --git a/scrapers.sql b/scrapers.sql
@@ -790,8 +790,8 @@ if(newUri) {
 		return true;
 	}
 	
-	var urlRe = new RegExp("^(http://[^/]+(/search/[^/]+/))([^\?]*)");
-	var m = urlRe.exec(uri);
+	var urlRe = new RegExp("^(http://[^/]+(/search/[^/]+/))");
+	var m = urlRe.exec(urls[0]);
 	var clearUrl = m[0]+"?clear_saves=1";
 	var postUrl = m[0];
 	var exportUrl = m[1]+"++export/1,-1,-1,B/export";
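
As a usage sketch (not part of this commit's diff): a scraper might drive the
relocated helper directly as below. The URLs and callback bodies are
hypothetical; the parameter order (firstDoc, urls, processor, done, exception,
saveBrowser) matches the comment block added to Scholar.Utilities.HTTP above.

    // Hypothetical caller (illustration only)
    var pages = [
        "http://www.example.com/results?page=1",
        "http://www.example.com/results?page=2"
    ];
    Scholar.Utilities.HTTP.processDocuments(
        null,                   // firstDoc - no pre-loaded first document
        pages,                  // urls - loaded in order in a hidden browser
        function(browser) {     // processor - runs once per loaded page
            Scholar.debug("processing "+browser.contentDocument.location.href);
        },
        function() {            // done - all pages processed
            Scholar.debug("done");
        },
        function(e) {           // exception - load or processing error
            Scholar.debug("processDocuments failed: "+e);
        }
        // saveBrowser omitted: the hidden browser is deleted when finished
    );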