commit 714885295539dfb4e47576775b2a85a93eaa91c2
parent 1096a95f62fa477deacf25f033b5ec89acbad250
Author: Simon Kornblith <simon@simonster.com>
Date: Mon, 26 Jun 2006 14:46:57 +0000
make generic Scholar.Utilities class and HTTP-dependent Scholar.Utilities.Ingester and Scholar.Utilities.HTTP classes in preparation for import/export filters; split off into separate javascript file
Diffstat:
3 files changed, 573 insertions(+), 543 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -2,7 +2,7 @@
// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
// This code is licensed according to the GPL
-Scholar.Ingester = new function() {}
+Scholar.Ingester = new Object();
Scholar.Ingester.createHiddenBrowser = function(myWindow) {
// Create a hidden browser
@@ -165,545 +165,6 @@ Scholar.Ingester.Model.prototype.addTag = function() {}
Scholar.Ingester.Model.prototype.getRepository = function() {}
Scholar.Ingester.Model.prototype.detachRepository = function() {}
-/////////////////////////////////////////////////////////////////
-//
-// Scholar.Ingester.Utilities
-//
-/////////////////////////////////////////////////////////////////
-// Scholar.Ingester.Utilities class, a set of methods to assist in data
-// extraction. Most code here was stolen directly from the Piggy Bank project.
-Scholar.Ingester.Utilities = function(myWindow, proxiedURL) {
- this.window = myWindow;
- this.proxiedURL = proxiedURL;
-}
-
-// Adapter for Piggy Bank function to print debug messages; log level is
-// fixed at 4 (could change this)
-Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) {
- Scholar.debug(msg, 4);
-}
-
-// Appears to trim a string, chopping of newlines/spacing
-Scholar.Ingester.Utilities.prototype.trimString = function(s) {
- var i = 0;
- var spaceChars = " \n\r\t" + String.fromCharCode(160) /* */;
- while (i < s.length) {
- var c = s.charAt(i);
- if (spaceChars.indexOf(c) < 0) {
- break;
- }
- i++;
- }
-
- s = s.substring(i);
-
- i = s.length;
- while (i > 0) {
- var c = s.charAt(i - 1);
- if (spaceChars.indexOf(c) < 0) {
- break;
- }
- i--;
- }
-
- return s.substring(0, i);
-}
-
-// Takes an XPath query and returns the results
-Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
- var elmts = [];
-
- var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
- var elmt = iterator.iterateNext();
- var i = 0;
- while (elmt) {
- elmts[i++] = elmt;
- elmt = iterator.iterateNext();
- }
- return elmts;
-}
-
-// Loads a single document for a scraper, running succeeded() on success or
-// failed() on failure
-Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
- Scholar.debug("loadDocument called");
- this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
-}
-
-// Downloads and processes documents with processor()
-// browser - a browser object
-// firstDoc - the first document to process with the processor (if null,
-// first document is processed without processor)
-// urls - an array of URLs to load
-// processor - a function to execute to process each document
-// done - a function to execute when all document processing is complete
-// exception - a function to execute if an exception occurs (exceptions are
-// also logged in the Scholar for Firefox log)
-Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
- var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
- var myWindow = this.window;
- var prevUrl, url;
- Scholar.debug("processDocuments called");
-
- try {
- if (urls.length == 0) {
- if(firstDoc) {
- processor(firstDoc, done);
- } else {
- done();
- }
- return;
- }
-
- var urlIndex = -1;
- var doLoad = function() {
- urlIndex++;
- if (urlIndex < urls.length) {
- url = urls[urlIndex];
- if(this.proxiedURL) {
- url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
- }
- try {
- Scholar.debug("loading "+url);
- hiddenBrowser.loadURI(url);
- } catch (e) {
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
- exception(e);
- }
- } else {
- hiddenBrowser.removeEventListener("load", onLoad, true);
- Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
- done();
- }
- };
- var onLoad = function() {
- Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
- if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
- prevUrl = hiddenBrowser.contentDocument.location.href;
- try {
- var newHiddenBrowser = new Object();
- newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
- newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
- processor(newHiddenBrowser);
- } catch (e) {
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
- exception(e);
- }
- doLoad();
- }
- };
- var init = function() {
- Scholar.debug("init called");
- hiddenBrowser.addEventListener("load", onLoad, true);
-
- if (firstDoc) {
- Scholar.debug("processing");
- processor(firstDoc, doLoad);
- } else {
- Scholar.debug("doing load");
- doLoad();
- }
- }
-
- init();
- } catch (e) {
- Scholar.debug("processDocuments: " + e);
- exception(e);
- }
-}
-
-// Appears to look for links in a document containing a certain substring
-Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) {
- var urls = [];
- var addedURLs = [];
-
- var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
- var aElement = aElements.iterateNext();
- while (aElement) {
- var href = aElement.href;
- if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
- urls.unshift(href);
- addedURLs[href] = true;
- }
- aElement = aElements.iterateNext();
- }
- return urls;
-}
-
-// For now, we're going to skip the getLLsFromAddresses function (which gets
-// latitude and longitude pairs from a series of addresses, but requires the
-// big mess of Java code that is the Piggy Bank server) and the geoHelper
-// tools (which rely on getLLsFromAddresses) since these are probably not
-// essential components for Scholar and would take a great deal of effort to
-// implement. We can, however, always implement them later.
-
-/*
- * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
- * Functions below this point are extensions to the utilities provided by
- * Piggy Bank. When used in external code, the repository will need to add
- * a function definition when exporting in Piggy Bank format.
- */
-
-/*
- * Converts a JavaScript date object to an ISO-style date
- */
-Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) {
- var date = "";
- var year = jsDate.getFullYear().toString();
- var month = (jsDate.getMonth()+1).toString();
- var day = jsDate.getDate().toString();
-
- for(var i = year.length; i<4; i++) {
- date += "0";
- }
- date += year+"-";
-
- if(month.length == 1) {
- date += "0";
- }
- date += month+"-";
-
- if(day.length == 1) {
- date += "0";
- }
- date += day;
-
- return date;
-}
-
-/*
- * Gets a given node (assumes only one value)
- */
-Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext();
-}
-
-/*
- * Gets a given node as a string containing all child nodes
- */
-Scholar.Ingester.Utilities.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) {
- var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
- var returnVar = "";
- for(var i=0; i<elmts.length; i++) {
- returnVar += elmts[i].nodeValue;
- }
- return returnVar;
-}
-
-/*
- * Cleans extraneous punctuation off an author name
- */
-Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
- author = author.replace(/ +/, ' ');
- // Add period for initials
- if(author.substring(author.length-2, author.length-1) == " ") {
- author += ".";
- }
- var splitNames = author.split(', ');
- if(splitNames.length > 1) {
- author = splitNames[1]+' '+splitNames[0];
- }
- return author;
-}
-
-/*
- * Cleans whitespace off a string and replaces multiple spaces with one
- */
-Scholar.Ingester.Utilities.prototype.cleanString = function(s) {
- s = s.replace(/[ \xA0]+/g, " ");
- return this.trimString(s);
-}
-
-/*
- * Cleans any non-world non-parenthesis characters off the ends of a string
- */
-Scholar.Ingester.Utilities.prototype.superCleanString = function(x) {
- var x = x.replace(/^[^\w(]+/, "");
- return x.replace(/[^\w)]+$/, "");
-}
-
-/*
- * Eliminates HTML tags, replacing <br>s with /ns
- */
-Scholar.Ingester.Utilities.prototype.cleanTags = function(x) {
- x = x.replace(/<br[^>]*>/gi, "\n");
- return x.replace(/<[^>]+>/g, "");
-}
-
-/*
- * Allows a user to select which items to scrape
- */
-Scholar.Ingester.Utilities.prototype.selectItems = function(itemList) {
- // mozillazine made me do it! honest!
- var io = { dataIn:itemList, dataOut:null }
- var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
- "_blank","chrome,modal,centerscreen,resizable=yes", io);
- return io.dataOut;
-}
-
-/*
- * Grabs items based on URLs
- */
-Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) {
- var availableItems = new Object(); // Technically, associative arrays are objects
-
- // Require link to match this
- if(urlRe) {
- var urlRegexp = new RegExp();
- urlRegexp.compile(urlRe, "i");
- }
- // Do not allow text to match this
- if(rejectRe) {
- var rejectRegexp = new RegExp();
- rejectRegexp.compile(rejectRe, "i");
- }
-
- if(!inHere.length) {
- inHere = new Array(inHere);
- }
-
- for(var j=0; j<inHere.length; j++) {
- var links = inHere[j].getElementsByTagName("a");
- for(var i=0; i<links.length; i++) {
- if(!urlRe || urlRegexp.test(links[i].href)) {
- var text = this.getNodeString(doc, links[i], './/text()', null);
- if(text) {
- text = this.cleanString(text);
- if(!rejectRe || !rejectRegexp.test(text)) {
- if(availableItems[links[i].href]) {
- if(text != availableItems[links[i].href]) {
- availableItems[links[i].href] += " "+text;
- }
- } else {
- availableItems[links[i].href] = text;
- }
- }
- }
- }
- }
- }
-
- return availableItems;
-}
-
-// These functions are for use by importMARCRecord. They're private, because,
-// while they are useful, it's also nice if as many of our scrapers as possible
-// are PiggyBank compatible, and if our scrapers used functions, that would
-// break compatibility
-Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
- return author.replace(/ +/, ' ');
-}
-
-Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) {
- author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
- var regexp = /^[^ ]*/;
- var m = regexp.exec(author);
- if(m) {
- return m[0];
- }
-}
-Scholar.Ingester.Utilities.prototype._MARCPullYear = function(text) {
- var pullRe = /[0-9]+/;
- var m = pullRe.exec(text);
- if(m) {
- return m[0];
- }
-}
-
-Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
- if(!part) {
- part = 'a';
- }
- var field = record.get_field_subfields(fieldNo);
- Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
- if(field) {
- for(i in field) {
- var value;
- for(var j=0; j<part.length; j++) {
- var myPart = part.substr(j, 1);
- if(field[i][myPart]) {
- if(value) {
- value += " "+field[i][myPart];
- } else {
- value = field[i][myPart];
- }
- }
- }
- if(value) {
- if(execMe) {
- value = execMe(value);
- }
- if(prefix) {
- value = prefix + value;
- }
- model.addStatement(uri, rdfUri, value);
- }
- }
- }
- return model;
-}
-
-// This is an extension to PiggyBank's architecture. It's here so that we don't
-// need an enormous library for each scraper that wants to use MARC records
-Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) {
- var prefixDC = 'http://purl.org/dc/elements/1.1/';
- var prefixDCMI = 'http://purl.org/dc/dcmitype/';
- var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
- var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
-
- // Extract ISBNs
- model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
- // Extract ISSNs
- model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
- // Extract creators
- model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
- model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
- model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
- model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
- model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
- model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
- if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
- // some LOC entries have no listed author, but have the author in the person subject field as the first entry
- var field = record.get_field_subfields('600');
- if(field[0]) {
- model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
- }
- }
- // Extract title
- model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString, '', 'ab');
- // Extract edition
- model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
- // Extract place info
- model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
- // Extract publisher info
- model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
- // Extract year
- model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c');
- // Extract series
- model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
- // Extract call number
- model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab');
- model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab');
- model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab');
- model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab');
- model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a');
- model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab');
-
- // Set type
- model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true);
-}
-
-/*
- * END SCHOLAR FOR FIREFOX EXTENSIONS
- */
-
-// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be
-// accessed outside the sandbox, and even if it could, it wouldn't let scripts
-// access across domains, so everything's replicated here.
-Scholar.Ingester.HTTPUtilities = function(contentWindow, proxiedURL) {
- this.window = contentWindow;
- this.proxiedURL = proxiedURL;
-}
-
-Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) {
- if(this.proxiedURL) {
- url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
- }
-
- var xmlhttp = new this.window.XMLHttpRequest();
-
- xmlhttp.open('GET', url, true);
- xmlhttp.overrideMimeType("text/plain");
-
- var me = this;
- xmlhttp.onreadystatechange = function() {
- me.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(null);
-}
-
-Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) {
- if(this.proxiedURL) {
- url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
- }
-
- var xmlhttp = new this.window.XMLHttpRequest();
-
- xmlhttp.open('POST', url, true);
- xmlhttp.overrideMimeType("text/plain");
-
- var me = this;
- xmlhttp.onreadystatechange = function() {
- me.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(body);
-}
-
-Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) {
- if(this.proxiedURL) {
- url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
- }
-
- var xmlhttp = new this.window.XMLHttpRequest();
-
- xmlhttp.open('OPTIONS', url, true);
- xmlhttp.overrideMimeType("text/plain");
-
- var me = this;
- xmlhttp.onreadystatechange = function() {
- me.stateChange(xmlhttp, onStatus, onDone);
- };
- xmlhttp.send(body);
-}
-
-Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
- switch (xmlhttp.readyState) {
-
- // Request not yet made
- case 1:
- break;
-
- // Contact established with server but nothing downloaded yet
- case 2:
- try {
- // Check for HTTP status 200
- if (xmlhttp.status != 200) {
- if (onStatus) {
- onStatus(
- xmlhttp.status,
- xmlhttp.statusText,
- xmlhttp
- );
- xmlhttp.abort();
- }
- }
- } catch (e) {
- Scholar.debug(e, 2);
- }
- break;
-
- // Called multiple while downloading in progress
- case 3:
- break;
-
- // Download complete
- case 4:
- try {
- if (onDone) {
- onDone(xmlhttp.responseText, xmlhttp);
- }
- } catch (e) {
- Scholar.debug(e, 2);
- }
- break;
- }
-}
-
//////////////////////////////////////////////////////////////////////////////
//
// Scholar.Ingester.Document
@@ -854,7 +315,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
* model - the object representing the RDF model of data to be returned
* (see Scholar.Ingester.Model)
* utilities - a set of utilities for making certain tasks easier
- * (see Scholar.Ingester.Utilities);
+ * (see Scholar.Utilities);
*
* Piggy Bank/FS also offers two functions to simplify asynchronous requests
* (these will only be available for scraping, and not for scrape detection)
@@ -889,8 +350,8 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this._sandbox.browser = this.browser;
this._sandbox.doc = this.browser.contentDocument;
this._sandbox.url = this.url;
- this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window, this.proxiedURL);
- this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow, this.proxiedURL);
+ this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL);
+ this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.HTTP(this._appSvc.hiddenDOMWindow, this.proxiedURL);
this._sandbox.window = this.window;
this._sandbox.model = this.model;
this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -0,0 +1,564 @@
+// Scholar for Firefox Utilities
+// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed)
+// This code is licensed according to the GPL
+
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Utilities
+//
+/////////////////////////////////////////////////////////////////
+// Scholar.Utilities class, a set of methods to assist in data
+// extraction. Some of the code here was stolen directly from the Piggy Bank
+// project.
+
+Scholar.Utilities = function () {}
+
+// Adapter for Piggy Bank function to print debug messages; log level is
+// fixed at 4 (could change this)
+Scholar.Utilities.prototype.debugPrint = function(msg) {
+ Scholar.debug(msg, 4);
+}
+
+// Appears to trim a string, chopping off newlines/spacing
+Scholar.Utilities.prototype.trimString = function(s) {
+ var i = 0;
+ var spaceChars = " \n\r\t" + String.fromCharCode(160) /* */;
+ while (i < s.length) {
+ var c = s.charAt(i);
+ if (spaceChars.indexOf(c) < 0) {
+ break;
+ }
+ i++;
+ }
+
+ s = s.substring(i);
+
+ i = s.length;
+ while (i > 0) {
+ var c = s.charAt(i - 1);
+ if (spaceChars.indexOf(c) < 0) {
+ break;
+ }
+ i--;
+ }
+
+ return s.substring(0, i);
+}
+
+/*
+ * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
+ * Functions below this point are extensions to the utilities provided by
+ * Piggy Bank. When used in external code, the repository will need to add
+ * a function definition when exporting in Piggy Bank format.
+ */
+
+/*
+ * Converts a JavaScript date object to an ISO-style date
+ */
+Scholar.Utilities.prototype.dateToISO = function(jsDate) {
+ var date = "";
+ var year = jsDate.getFullYear().toString();
+ var month = (jsDate.getMonth()+1).toString();
+ var day = jsDate.getDate().toString();
+
+ for(var i = year.length; i<4; i++) {
+ date += "0";
+ }
+ date += year+"-";
+
+ if(month.length == 1) {
+ date += "0";
+ }
+ date += month+"-";
+
+ if(day.length == 1) {
+ date += "0";
+ }
+ date += day;
+
+ return date;
+}
+
+/*
+ * Cleans extraneous punctuation off an author name
+ */
+Scholar.Utilities.prototype.cleanAuthor = function(author) {
+ author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
+ author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
+ author = author.replace(/ +/, ' ');
+ // Add period for initials
+ if(author.substring(author.length-2, author.length-1) == " ") {
+ author += ".";
+ }
+ var splitNames = author.split(', ');
+ if(splitNames.length > 1) {
+ author = splitNames[1]+' '+splitNames[0];
+ }
+ return author;
+}
+
+/*
+ * Cleans whitespace off a string and replaces multiple spaces with one
+ */
+Scholar.Utilities.prototype.cleanString = function(s) {
+ s = s.replace(/[ \xA0]+/g, " ");
+ return this.trimString(s);
+}
+
+/*
+ * Cleans any non-word non-parenthesis characters off the ends of a string
+ */
+Scholar.Utilities.prototype.superCleanString = function(x) {
+ var x = x.replace(/^[^\w(]+/, "");
+ return x.replace(/[^\w)]+$/, "");
+}
+
+/*
+ * Eliminates HTML tags, replacing <br>s with \ns (newlines)
+ */
+Scholar.Utilities.prototype.cleanTags = function(x) {
+ x = x.replace(/<br[^>]*>/gi, "\n");
+ return x.replace(/<[^>]+>/g, "");
+}
+
+// These functions are for use by importMARCRecord. They're private, because,
+// while they are useful, it's also nice if as many of our scrapers as possible
+// are PiggyBank compatible, and if our scrapers used functions, that would
+// break compatibility
+Scholar.Utilities.prototype._MARCCleanString = function(author) {
+ author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
+ author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+ return author.replace(/ +/, ' ');
+}
+
+Scholar.Utilities.prototype._MARCCleanNumber = function(author) {
+ author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
+ author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+ var regexp = /^[^ ]*/;
+ var m = regexp.exec(author);
+ if(m) {
+ return m[0];
+ }
+}
+Scholar.Utilities.prototype._MARCPullYear = function(text) {
+ var pullRe = /[0-9]+/;
+ var m = pullRe.exec(text);
+ if(m) {
+ return m[0];
+ }
+}
+
+Scholar.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
+ if(!part) {
+ part = 'a';
+ }
+ var field = record.get_field_subfields(fieldNo);
+ Scholar.debug('Found '+field.length+' matches for '+fieldNo+part);
+ if(field) {
+ for(i in field) {
+ var value;
+ for(var j=0; j<part.length; j++) {
+ var myPart = part.substr(j, 1);
+ if(field[i][myPart]) {
+ if(value) {
+ value += " "+field[i][myPart];
+ } else {
+ value = field[i][myPart];
+ }
+ }
+ }
+ if(value) {
+ if(execMe) {
+ value = execMe(value);
+ }
+ if(prefix) {
+ value = prefix + value;
+ }
+ model.addStatement(uri, rdfUri, value);
+ }
+ }
+ }
+ return model;
+}
+
+// This is an extension to PiggyBank's architecture. It's here so that we don't
+// need an enormous library for each scraper that wants to use MARC records
+Scholar.Utilities.prototype.importMARCRecord = function(record, uri, model) {
+ var prefixDC = 'http://purl.org/dc/elements/1.1/';
+ var prefixDCMI = 'http://purl.org/dc/dcmitype/';
+ var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
+ var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
+
+ // Extract ISBNs
+ model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
+ // Extract ISSNs
+ model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
+ // Extract creators
+ model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor);
+ model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor);
+ model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString);
+ if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) {
+ // some LOC entries have no listed author, but have the author in the person subject field as the first entry
+ var field = record.get_field_subfields('600');
+ if(field[0]) {
+ model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a']));
+ }
+ }
+ // Extract title
+ model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString, '', 'ab');
+ // Extract edition
+ model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
+ // Extract place info
+ model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
+ // Extract publisher info
+ model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
+ // Extract year
+ model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c');
+ // Extract series
+ model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
+ // Extract call number
+ model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab');
+ model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab');
+ model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab');
+ model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab');
+ model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a');
+ model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab');
+
+ // Set type
+ model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true);
+}
+
+/*
+ * END SCHOLAR FOR FIREFOX EXTENSIONS
+ */
+
+/////////////////////////////////////////////////////////////////
+//
+// Scholar.Utilities.Ingester
+//
+/////////////////////////////////////////////////////////////////
+// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional
+// classes relating to data extraction specifically from HTML documents.
+
+Scholar.Utilities.Ingester = function(myWindow, proxiedURL) {
+ this.window = myWindow;
+ this.proxiedURL = proxiedURL;
+}
+
+Scholar.Utilities.Ingester.prototype = new Scholar.Utilities();
+
+// Takes an XPath query and returns the results
+Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) {
+ var elmts = [];
+
+ var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
+ var elmt = iterator.iterateNext();
+ var i = 0;
+ while (elmt) {
+ elmts[i++] = elmt;
+ elmt = iterator.iterateNext();
+ }
+ return elmts;
+}
+
+// Loads a single document for a scraper, running succeeded() on success or
+// failed() on failure
+Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) {
+ Scholar.debug("loadDocument called");
+ this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
+}
+
+// Downloads and processes documents with processor()
+// browser - a browser object
+// firstDoc - the first document to process with the processor (if null,
+// first document is processed without processor)
+// urls - an array of URLs to load
+// processor - a function to execute to process each document
+// done - a function to execute when all document processing is complete
+// exception - a function to execute if an exception occurs (exceptions are
+// also logged in the Scholar for Firefox log)
+Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+ var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
+ var myWindow = this.window;
+ var prevUrl, url;
+ Scholar.debug("processDocuments called");
+
+ try {
+ if (urls.length == 0) {
+ if(firstDoc) {
+ processor(firstDoc, done);
+ } else {
+ done();
+ }
+ return;
+ }
+
+ var urlIndex = -1;
+ var doLoad = function() {
+ urlIndex++;
+ if (urlIndex < urls.length) {
+ url = urls[urlIndex];
+ if(this.proxiedURL) {
+ url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
+ }
+ try {
+ Scholar.debug("loading "+url);
+ hiddenBrowser.loadURI(url);
+ } catch (e) {
+ Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2);
+ exception(e);
+ }
+ } else {
+ hiddenBrowser.removeEventListener("load", onLoad, true);
+ Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
+ done();
+ }
+ };
+ var onLoad = function() {
+ Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
+ if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
+ prevUrl = hiddenBrowser.contentDocument.location.href;
+ try {
+ var newHiddenBrowser = new Object();
+ newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
+ newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
+ processor(newHiddenBrowser);
+ } catch (e) {
+ Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2);
+ exception(e);
+ }
+ doLoad();
+ }
+ };
+ var init = function() {
+ Scholar.debug("init called");
+ hiddenBrowser.addEventListener("load", onLoad, true);
+
+ if (firstDoc) {
+ Scholar.debug("processing");
+ processor(firstDoc, doLoad);
+ } else {
+ Scholar.debug("doing load");
+ doLoad();
+ }
+ }
+
+ init();
+ } catch (e) {
+ Scholar.debug("processDocuments: " + e);
+ exception(e);
+ }
+}
+
// Collects the hrefs of all <a> elements in doc whose URL contains the given
// substring. Duplicates are skipped; results come back in reverse document
// order because each new URL is prepended (unshift), matching the original
// Piggy Bank behavior.
Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) {
	var urls = [];
	var addedURLs = {};	// string-keyed lookup table; was mistakenly a new Array()
	
	var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
	var aElement = aElements.iterateNext();
	while (aElement) {
		var href = aElement.href;
		if (href.indexOf(substring) >= 0 && !(addedURLs[href])) {
			urls.unshift(href);
			addedURLs[href] = true;
		}
		aElement = aElements.iterateNext();
	}
	return urls;
}
+
+// For now, we're going to skip the getLLsFromAddresses function (which gets
+// latitude and longitude pairs from a series of addresses, but requires the
+// big mess of Java code that is the Piggy Bank server) and the geoHelper
+// tools (which rely on getLLsFromAddresses) since these are probably not
+// essential components for Scholar and would take a great deal of effort to
+// implement. We can, however, always implement them later.
+
+/*
+ * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS
+ */
+
/*
 * Evaluates an XPath expression and returns the first matching node
 * (assumes only one value)
 */
Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) {
	var result = doc.evaluate(xpath, contextNode, nsResolver,
		Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null);
	return result.iterateNext();
}
+
/*
 * Concatenates the values of all nodes matching an XPath expression into a
 * single string
 */
Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) {
	var nodes = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
	var text = "";
	var i = 0;
	// += coerces a null nodeValue to "null", exactly as the original did;
	// do not replace with Array.join, which drops nulls
	while(i < nodes.length) {
		text += nodes[i].nodeValue;
		i++;
	}
	return text;
}
+
/*
 * Presents a modal dialog allowing the user to select which items to scrape.
 *
 * itemList - data handed to the dialog via its dataIn property
 * Returns whatever the dialog placed in dataOut (null if nothing selected).
 */
Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) {
	// mozillazine made me do it! honest!
	var io = { dataIn:itemList, dataOut:null }
	// The dialog is modal, so dataOut is populated by the time openDialog
	// returns; the returned window object was assigned but never used
	this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul",
		"_blank","chrome,modal,centerscreen,resizable=yes", io);
	return io.dataOut;
}
+
/*
 * Builds an associative array of URL => link text for candidate items.
 *
 * doc      - the document being scraped
 * inHere   - a DOM element, or array of elements, whose <a> descendants are
 *            examined
 * urlRe    - optional pattern (string); a link's href must match it
 * rejectRe - optional pattern (string); a link's text must NOT match it
 */
Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) {
	var availableItems = new Object(); // Technically, associative arrays are objects
	
	// Require link to match this
	if(urlRe) {
		// RegExp.prototype.compile is nonstandard and deprecated; the
		// constructor form is equivalent
		var urlRegexp = new RegExp(urlRe, "i");
	}
	// Do not allow text to match this
	if(rejectRe) {
		var rejectRegexp = new RegExp(rejectRe, "i");
	}
	
	// Allow a single element to be passed in place of an array
	if(!inHere.length) {
		inHere = new Array(inHere);
	}
	
	for(var j=0; j<inHere.length; j++) {
		var links = inHere[j].getElementsByTagName("a");
		for(var i=0; i<links.length; i++) {
			if(!urlRe || urlRegexp.test(links[i].href)) {
				var text = this.getNodeString(doc, links[i], './/text()', null);
				if(text) {
					text = this.cleanString(text);
					if(!rejectRe || !rejectRegexp.test(text)) {
						// Multiple links to the same href: concatenate any
						// distinct texts rather than overwriting
						if(availableItems[links[i].href]) {
							if(text != availableItems[links[i].href]) {
								availableItems[links[i].href] += " "+text;
							}
						} else {
							availableItems[links[i].href] = text;
						}
					}
				}
			}
		}
	}
	
	return availableItems;
}
+
+/*
+ * END SCHOLAR FOR FIREFOX EXTENSIONS
+ */
+
// Front ends for XMLHttpRequest. XMLHttpRequest can't actually be accessed
// outside the sandbox, and even if it could, it wouldn't let scripts access
// across domains, so everything's replicated here.
//
// contentWindow - window whose XMLHttpRequest constructor is used
// proxiedURL    - when set, request URLs are first translated by the
//                 proxy monitor
Scholar.Utilities.HTTP = function(contentWindow, proxiedURL) {
	this.proxiedURL = proxiedURL;
	this.window = contentWindow;
}
+
/*
 * Shared implementation for doGet/doPost/doOptions (the three bodies were
 * identical except for the method name): translates the URL through the
 * proxy monitor when configured, then issues an asynchronous request whose
 * progress is reported through stateChange().
 */
Scholar.Utilities.HTTP.prototype._doRequest = function(method, url, body, onStatus, onDone) {
	if(this.proxiedURL) {
		url = Scholar.Ingester.ProxyMonitor.properToProxy(url);
	}
	
	var xmlhttp = new this.window.XMLHttpRequest();
	
	xmlhttp.open(method, url, true);
	// Treat every response as plain text regardless of its Content-Type
	xmlhttp.overrideMimeType("text/plain");
	
	var me = this;
	xmlhttp.onreadystatechange = function() {
		me.stateChange(xmlhttp, onStatus, onDone);
	};
	xmlhttp.send(body);
}

// Performs an asynchronous HTTP GET request
Scholar.Utilities.HTTP.prototype.doGet = function(url, onStatus, onDone) {
	this._doRequest('GET', url, null, onStatus, onDone);
}

// Performs an asynchronous HTTP POST request with the given body
Scholar.Utilities.HTTP.prototype.doPost = function(url, body, onStatus, onDone) {
	this._doRequest('POST', url, body, onStatus, onDone);
}

// Performs an asynchronous HTTP OPTIONS request with the given body
Scholar.Utilities.HTTP.prototype.doOptions = function(url, body, onStatus, onDone) {
	this._doRequest('OPTIONS', url, body, onStatus, onDone);
}
+
/*
 * Dispatches onStatus/onDone callbacks as an XMLHttpRequest progresses.
 *
 * readyState 2 (headers received): a non-200 status is reported via
 * onStatus, and the request is aborted — but only when an onStatus handler
 * was supplied, matching the original behavior.
 * readyState 4 (download complete): the response text is handed to onDone.
 * Callback exceptions are logged and swallowed so the request machinery
 * is not disturbed.
 */
Scholar.Utilities.HTTP.prototype.stateChange = function(xmlhttp, onStatus, onDone) {
	if (xmlhttp.readyState == 2) {
		// Contact established with server but nothing downloaded yet;
		// check for HTTP status 200
		try {
			if (xmlhttp.status != 200 && onStatus) {
				onStatus(xmlhttp.status, xmlhttp.statusText, xmlhttp);
				xmlhttp.abort();
			}
		} catch (e) {
			Scholar.debug(e, 2);
		}
	} else if (xmlhttp.readyState == 4) {
		// Download complete
		try {
			if (onDone) {
				onDone(xmlhttp.responseText, xmlhttp);
			}
		} catch (e) {
			Scholar.debug(e, 2);
		}
	}
	// readyState 1 (request not yet made) and 3 (called multiple times
	// while the download is in progress) require no action
}
+\ No newline at end of file
diff --git a/components/chnmIScholarService.js b/components/chnmIScholarService.js
@@ -45,6 +45,10 @@ Cc["@mozilla.org/moz/jssubscript-loader;1"]
Cc["@mozilla.org/moz/jssubscript-loader;1"]
.getService(Ci.mozIJSSubScriptLoader)
.loadSubScript("chrome://scholar/content/xpcom/marc.js");
+
+Cc["@mozilla.org/moz/jssubscript-loader;1"]
+ .getService(Ci.mozIJSSubScriptLoader)
+ .loadSubScript("chrome://scholar/content/xpcom/utilities.js");
/********************************************************************/