www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 714885295539dfb4e47576775b2a85a93eaa91c2
parent 1096a95f62fa477deacf25f033b5ec89acbad250
Author: Simon Kornblith <simon@simonster.com>
Date:   Mon, 26 Jun 2006 14:46:57 +0000

make generic Scholar.Utilities class and HTTP-dependent Scholar.Utilities.Ingester and Scholar.Utilities.HTTP classes in preparation for import/export filters; split off into separate javascript file


Diffstat:
Mchrome/chromeFiles/content/scholar/xpcom/ingester.js | 547+------------------------------------------------------------------------------
Achrome/chromeFiles/content/scholar/xpcom/utilities.js | 565+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Mcomponents/chnmIScholarService.js | 4++++
3 files changed, 573 insertions(+), 543 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -2,7 +2,7 @@ // Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed) // This code is licensed according to the GPL -Scholar.Ingester = new function() {} +Scholar.Ingester = new Object(); Scholar.Ingester.createHiddenBrowser = function(myWindow) { // Create a hidden browser @@ -165,545 +165,6 @@ Scholar.Ingester.Model.prototype.addTag = function() {} Scholar.Ingester.Model.prototype.getRepository = function() {} Scholar.Ingester.Model.prototype.detachRepository = function() {} -///////////////////////////////////////////////////////////////// -// -// Scholar.Ingester.Utilities -// -///////////////////////////////////////////////////////////////// -// Scholar.Ingester.Utilities class, a set of methods to assist in data -// extraction. Most code here was stolen directly from the Piggy Bank project. -Scholar.Ingester.Utilities = function(myWindow, proxiedURL) { - this.window = myWindow; - this.proxiedURL = proxiedURL; -} - -// Adapter for Piggy Bank function to print debug messages; log level is -// fixed at 4 (could change this) -Scholar.Ingester.Utilities.prototype.debugPrint = function(msg) { - Scholar.debug(msg, 4); -} - -// Appears to trim a string, chopping of newlines/spacing -Scholar.Ingester.Utilities.prototype.trimString = function(s) { - var i = 0; - var spaceChars = " \n\r\t" + String.fromCharCode(160) /* &nbsp; */; - while (i < s.length) { - var c = s.charAt(i); - if (spaceChars.indexOf(c) < 0) { - break; - } - i++; - } - - s = s.substring(i); - - i = s.length; - while (i > 0) { - var c = s.charAt(i - 1); - if (spaceChars.indexOf(c) < 0) { - break; - } - i--; - } - - return s.substring(0, i); -} - -// Takes an XPath query and returns the results -Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) { - var elmts = []; - - var iterator = doc.evaluate(xpath, parentNode, 
nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); - var elmt = iterator.iterateNext(); - var i = 0; - while (elmt) { - elmts[i++] = elmt; - elmt = iterator.iterateNext(); - } - return elmts; -} - -// Loads a single document for a scraper, running succeeded() on success or -// failed() on failure -Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) { - Scholar.debug("loadDocument called"); - this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed); -} - -// Downloads and processes documents with processor() -// browser - a browser object -// firstDoc - the first document to process with the processor (if null, -// first document is processed without processor) -// urls - an array of URLs to load -// processor - a function to execute to process each document -// done - a function to execute when all document processing is complete -// exception - a function to execute if an exception occurs (exceptions are -// also logged in the Scholar for Firefox log) -Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { - var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window); - var myWindow = this.window; - var prevUrl, url; - Scholar.debug("processDocuments called"); - - try { - if (urls.length == 0) { - if(firstDoc) { - processor(firstDoc, done); - } else { - done(); - } - return; - } - - var urlIndex = -1; - var doLoad = function() { - urlIndex++; - if (urlIndex < urls.length) { - url = urls[urlIndex]; - if(this.proxiedURL) { - url = Scholar.Ingester.ProxyMonitor.properToProxy(url); - } - try { - Scholar.debug("loading "+url); - hiddenBrowser.loadURI(url); - } catch (e) { - Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2); - exception(e); - } - } else { - hiddenBrowser.removeEventListener("load", onLoad, true); - Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser); - done(); - } - }; - 
var onLoad = function() { - Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded"); - if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times - prevUrl = hiddenBrowser.contentDocument.location.href; - try { - var newHiddenBrowser = new Object(); - newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; - newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; - processor(newHiddenBrowser); - } catch (e) { - Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2); - exception(e); - } - doLoad(); - } - }; - var init = function() { - Scholar.debug("init called"); - hiddenBrowser.addEventListener("load", onLoad, true); - - if (firstDoc) { - Scholar.debug("processing"); - processor(firstDoc, doLoad); - } else { - Scholar.debug("doing load"); - doLoad(); - } - } - - init(); - } catch (e) { - Scholar.debug("processDocuments: " + e); - exception(e); - } -} - -// Appears to look for links in a document containing a certain substring -Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, substring) { - var urls = []; - var addedURLs = []; - - var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); - var aElement = aElements.iterateNext(); - while (aElement) { - var href = aElement.href; - if (href.indexOf(substring) >= 0 && !(addedURLs[href])) { - urls.unshift(href); - addedURLs[href] = true; - } - aElement = aElements.iterateNext(); - } - return urls; -} - -// For now, we're going to skip the getLLsFromAddresses function (which gets -// latitude and longitude pairs from a series of addresses, but requires the -// big mess of Java code that is the Piggy Bank server) and the geoHelper -// tools (which rely on getLLsFromAddresses) since these are probably not -// essential components for Scholar and would take a great deal of effort to -// implement. We can, however, always implement them later. 
- -/* - * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS - * Functions below this point are extensions to the utilities provided by - * Piggy Bank. When used in external code, the repository will need to add - * a function definition when exporting in Piggy Bank format. - */ - -/* - * Converts a JavaScript date object to an ISO-style date - */ -Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) { - var date = ""; - var year = jsDate.getFullYear().toString(); - var month = (jsDate.getMonth()+1).toString(); - var day = jsDate.getDate().toString(); - - for(var i = year.length; i<4; i++) { - date += "0"; - } - date += year+"-"; - - if(month.length == 1) { - date += "0"; - } - date += month+"-"; - - if(day.length == 1) { - date += "0"; - } - date += day; - - return date; -} - -/* - * Gets a given node (assumes only one value) - */ -Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); -} - -/* - * Gets a given node as a string containing all child nodes - */ -Scholar.Ingester.Utilities.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) { - var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); - var returnVar = ""; - for(var i=0; i<elmts.length; i++) { - returnVar += elmts[i].nodeValue; - } - return returnVar; -} - -/* - * Cleans extraneous punctuation off an author name - */ -Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); - author = author.replace(/ +/, ' '); - // Add period for initials - if(author.substring(author.length-2, author.length-1) == " ") { - author += "."; - } - var splitNames = author.split(', '); - if(splitNames.length > 1) { - author = splitNames[1]+' '+splitNames[0]; - } - return author; -} - -/* - * Cleans 
whitespace off a string and replaces multiple spaces with one - */ -Scholar.Ingester.Utilities.prototype.cleanString = function(s) { - s = s.replace(/[ \xA0]+/g, " "); - return this.trimString(s); -} - -/* - * Cleans any non-world non-parenthesis characters off the ends of a string - */ -Scholar.Ingester.Utilities.prototype.superCleanString = function(x) { - var x = x.replace(/^[^\w(]+/, ""); - return x.replace(/[^\w)]+$/, ""); -} - -/* - * Eliminates HTML tags, replacing <br>s with /ns - */ -Scholar.Ingester.Utilities.prototype.cleanTags = function(x) { - x = x.replace(/<br[^>]*>/gi, "\n"); - return x.replace(/<[^>]+>/g, ""); -} - -/* - * Allows a user to select which items to scrape - */ -Scholar.Ingester.Utilities.prototype.selectItems = function(itemList) { - // mozillazine made me do it! honest! - var io = { dataIn:itemList, dataOut:null } - var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul", - "_blank","chrome,modal,centerscreen,resizable=yes", io); - return io.dataOut; -} - -/* - * Grabs items based on URLs - */ -Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) { - var availableItems = new Object(); // Technically, associative arrays are objects - - // Require link to match this - if(urlRe) { - var urlRegexp = new RegExp(); - urlRegexp.compile(urlRe, "i"); - } - // Do not allow text to match this - if(rejectRe) { - var rejectRegexp = new RegExp(); - rejectRegexp.compile(rejectRe, "i"); - } - - if(!inHere.length) { - inHere = new Array(inHere); - } - - for(var j=0; j<inHere.length; j++) { - var links = inHere[j].getElementsByTagName("a"); - for(var i=0; i<links.length; i++) { - if(!urlRe || urlRegexp.test(links[i].href)) { - var text = this.getNodeString(doc, links[i], './/text()', null); - if(text) { - text = this.cleanString(text); - if(!rejectRe || !rejectRegexp.test(text)) { - if(availableItems[links[i].href]) { - if(text != availableItems[links[i].href]) { - 
availableItems[links[i].href] += " "+text; - } - } else { - availableItems[links[i].href] = text; - } - } - } - } - } - } - - return availableItems; -} - -// These functions are for use by importMARCRecord. They're private, because, -// while they are useful, it's also nice if as many of our scrapers as possible -// are PiggyBank compatible, and if our scrapers used functions, that would -// break compatibility -Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); - return author.replace(/ +/, ' '); -} - -Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); - var regexp = /^[^ ]*/; - var m = regexp.exec(author); - if(m) { - return m[0]; - } -} -Scholar.Ingester.Utilities.prototype._MARCPullYear = function(text) { - var pullRe = /[0-9]+/; - var m = pullRe.exec(text); - if(m) { - return m[0]; - } -} - -Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) { - if(!part) { - part = 'a'; - } - var field = record.get_field_subfields(fieldNo); - Scholar.debug('Found '+field.length+' matches for '+fieldNo+part); - if(field) { - for(i in field) { - var value; - for(var j=0; j<part.length; j++) { - var myPart = part.substr(j, 1); - if(field[i][myPart]) { - if(value) { - value += " "+field[i][myPart]; - } else { - value = field[i][myPart]; - } - } - } - if(value) { - if(execMe) { - value = execMe(value); - } - if(prefix) { - value = prefix + value; - } - model.addStatement(uri, rdfUri, value); - } - } - } - return model; -} - -// This is an extension to PiggyBank's architecture. 
It's here so that we don't -// need an enormous library for each scraper that wants to use MARC records -Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) { - var prefixDC = 'http://purl.org/dc/elements/1.1/'; - var prefixDCMI = 'http://purl.org/dc/dcmitype/'; - var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/'; - var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'; - - // Extract ISBNs - model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN '); - // Extract ISSNs - model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN '); - // Extract creators - model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor); - model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor); - model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString); - if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) { - // some LOC entries have no listed author, but have the author in the person subject field as the first entry - var field = record.get_field_subfields('600'); - if(field[0]) { - model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a'])); - } - } - // Extract title - model = this._MARCAssociateField(record, uri, model, '245', 
prefixDC + 'title', this._MARCCleanString, '', 'ab'); - // Extract edition - model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString); - // Extract place info - model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a'); - // Extract publisher info - model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b'); - // Extract year - model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c'); - // Extract series - model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString); - // Extract call number - model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab'); - model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab'); - model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab'); - model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab'); - model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a'); - model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab'); - - // Set type - model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true); -} - -/* - * END SCHOLAR FOR FIREFOX EXTENSIONS - */ - -// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be -// accessed outside the sandbox, and even if it could, it wouldn't let scripts -// access across domains, so everything's replicated here. 
-Scholar.Ingester.HTTPUtilities = function(contentWindow, proxiedURL) { - this.window = contentWindow; - this.proxiedURL = proxiedURL; -} - -Scholar.Ingester.HTTPUtilities.prototype.doGet = function(url, onStatus, onDone) { - if(this.proxiedURL) { - url = Scholar.Ingester.ProxyMonitor.properToProxy(url); - } - - var xmlhttp = new this.window.XMLHttpRequest(); - - xmlhttp.open('GET', url, true); - xmlhttp.overrideMimeType("text/plain"); - - var me = this; - xmlhttp.onreadystatechange = function() { - me.stateChange(xmlhttp, onStatus, onDone); - }; - xmlhttp.send(null); -} - -Scholar.Ingester.HTTPUtilities.prototype.doPost = function(url, body, onStatus, onDone) { - if(this.proxiedURL) { - url = Scholar.Ingester.ProxyMonitor.properToProxy(url); - } - - var xmlhttp = new this.window.XMLHttpRequest(); - - xmlhttp.open('POST', url, true); - xmlhttp.overrideMimeType("text/plain"); - - var me = this; - xmlhttp.onreadystatechange = function() { - me.stateChange(xmlhttp, onStatus, onDone); - }; - xmlhttp.send(body); -} - -Scholar.Ingester.HTTPUtilities.prototype.doOptions = function(url, body, onStatus, onDone) { - if(this.proxiedURL) { - url = Scholar.Ingester.ProxyMonitor.properToProxy(url); - } - - var xmlhttp = new this.window.XMLHttpRequest(); - - xmlhttp.open('OPTIONS', url, true); - xmlhttp.overrideMimeType("text/plain"); - - var me = this; - xmlhttp.onreadystatechange = function() { - me.stateChange(xmlhttp, onStatus, onDone); - }; - xmlhttp.send(body); -} - -Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatus, onDone) { - switch (xmlhttp.readyState) { - - // Request not yet made - case 1: - break; - - // Contact established with server but nothing downloaded yet - case 2: - try { - // Check for HTTP status 200 - if (xmlhttp.status != 200) { - if (onStatus) { - onStatus( - xmlhttp.status, - xmlhttp.statusText, - xmlhttp - ); - xmlhttp.abort(); - } - } - } catch (e) { - Scholar.debug(e, 2); - } - break; - - // Called multiple while 
downloading in progress - case 3: - break; - - // Download complete - case 4: - try { - if (onDone) { - onDone(xmlhttp.responseText, xmlhttp); - } - } catch (e) { - Scholar.debug(e, 2); - } - break; - } -} - ////////////////////////////////////////////////////////////////////////////// // // Scholar.Ingester.Document @@ -854,7 +315,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) { * model - the object representing the RDF model of data to be returned * (see Scholar.Ingester.Model) * utilities - a set of utilities for making certain tasks easier - * (see Scholar.Ingester.Utilities); + * (see Scholar.Utilities); * * Piggy Bank/FS also offers two functions to simplify asynchronous requests * (these will only be available for scraping, and not for scrape detection) @@ -889,8 +350,8 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() { this._sandbox.browser = this.browser; this._sandbox.doc = this.browser.contentDocument; this._sandbox.url = this.url; - this._sandbox.utilities = new Scholar.Ingester.Utilities(this.window, this.proxiedURL); - this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow, this.proxiedURL); + this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL); + this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.HTTP(this._appSvc.hiddenDOMWindow, this.proxiedURL); this._sandbox.window = this.window; this._sandbox.model = this.model; this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult; diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -0,0 +1,564 @@ +// Scholar for Firefox Utilities +// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed) +// This code is licensed according to the GPL + +///////////////////////////////////////////////////////////////// +// +// Scholar.Utilities +// 
+///////////////////////////////////////////////////////////////// +// Scholar.Utilities class, a set of methods to assist in data +// extraction. Some of the code here was stolen directly from the Piggy Bank +// project. + +Scholar.Utilities = function () {} + +// Adapter for Piggy Bank function to print debug messages; log level is +// fixed at 4 (could change this) +Scholar.Utilities.prototype.debugPrint = function(msg) { + Scholar.debug(msg, 4); +} + +// Appears to trim a string, chopping of newlines/spacing +Scholar.Utilities.prototype.trimString = function(s) { + var i = 0; + var spaceChars = " \n\r\t" + String.fromCharCode(160) /* &nbsp; */; + while (i < s.length) { + var c = s.charAt(i); + if (spaceChars.indexOf(c) < 0) { + break; + } + i++; + } + + s = s.substring(i); + + i = s.length; + while (i > 0) { + var c = s.charAt(i - 1); + if (spaceChars.indexOf(c) < 0) { + break; + } + i--; + } + + return s.substring(0, i); +} + +/* + * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS + * Functions below this point are extensions to the utilities provided by + * Piggy Bank. When used in external code, the repository will need to add + * a function definition when exporting in Piggy Bank format. 
+ */ + +/* + * Converts a JavaScript date object to an ISO-style date + */ +Scholar.Utilities.prototype.dateToISO = function(jsDate) { + var date = ""; + var year = jsDate.getFullYear().toString(); + var month = (jsDate.getMonth()+1).toString(); + var day = jsDate.getDate().toString(); + + for(var i = year.length; i<4; i++) { + date += "0"; + } + date += year+"-"; + + if(month.length == 1) { + date += "0"; + } + date += month+"-"; + + if(day.length == 1) { + date += "0"; + } + date += day; + + return date; +} + +/* + * Cleans extraneous punctuation off an author name + */ +Scholar.Utilities.prototype.cleanAuthor = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); + author = author.replace(/ +/, ' '); + // Add period for initials + if(author.substring(author.length-2, author.length-1) == " ") { + author += "."; + } + var splitNames = author.split(', '); + if(splitNames.length > 1) { + author = splitNames[1]+' '+splitNames[0]; + } + return author; +} + +/* + * Cleans whitespace off a string and replaces multiple spaces with one + */ +Scholar.Utilities.prototype.cleanString = function(s) { + s = s.replace(/[ \xA0]+/g, " "); + return this.trimString(s); +} + +/* + * Cleans any non-word non-parenthesis characters off the ends of a string + */ +Scholar.Utilities.prototype.superCleanString = function(x) { + var x = x.replace(/^[^\w(]+/, ""); + return x.replace(/[^\w)]+$/, ""); +} + +/* + * Eliminates HTML tags, replacing <br>s with /ns + */ +Scholar.Utilities.prototype.cleanTags = function(x) { + x = x.replace(/<br[^>]*>/gi, "\n"); + return x.replace(/<[^>]+>/g, ""); +} + +// These functions are for use by importMARCRecord. 
They're private, because, +// while they are useful, it's also nice if as many of our scrapers as possible +// are PiggyBank compatible, and if our scrapers used functions, that would +// break compatibility +Scholar.Utilities.prototype._MARCCleanString = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + return author.replace(/ +/, ' '); +} + +Scholar.Utilities.prototype._MARCCleanNumber = function(author) { + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); + author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); + var regexp = /^[^ ]*/; + var m = regexp.exec(author); + if(m) { + return m[0]; + } +} +Scholar.Utilities.prototype._MARCPullYear = function(text) { + var pullRe = /[0-9]+/; + var m = pullRe.exec(text); + if(m) { + return m[0]; + } +} + +Scholar.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) { + if(!part) { + part = 'a'; + } + var field = record.get_field_subfields(fieldNo); + Scholar.debug('Found '+field.length+' matches for '+fieldNo+part); + if(field) { + for(i in field) { + var value; + for(var j=0; j<part.length; j++) { + var myPart = part.substr(j, 1); + if(field[i][myPart]) { + if(value) { + value += " "+field[i][myPart]; + } else { + value = field[i][myPart]; + } + } + } + if(value) { + if(execMe) { + value = execMe(value); + } + if(prefix) { + value = prefix + value; + } + model.addStatement(uri, rdfUri, value); + } + } + } + return model; +} + +// This is an extension to PiggyBank's architecture. 
It's here so that we don't +// need an enormous library for each scraper that wants to use MARC records +Scholar.Utilities.prototype.importMARCRecord = function(record, uri, model) { + var prefixDC = 'http://purl.org/dc/elements/1.1/'; + var prefixDCMI = 'http://purl.org/dc/dcmitype/'; + var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/'; + var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'; + + // Extract ISBNs + model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN '); + // Extract ISSNs + model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN '); + // Extract creators + model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor); + model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor); + model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString); + if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) { + // some LOC entries have no listed author, but have the author in the person subject field as the first entry + var field = record.get_field_subfields('600'); + if(field[0]) { + model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a'])); + } + } + // Extract title + model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 
'title', this._MARCCleanString, '', 'ab'); + // Extract edition + model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString); + // Extract place info + model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a'); + // Extract publisher info + model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b'); + // Extract year + model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c'); + // Extract series + model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString); + // Extract call number + model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab'); + model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab'); + model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab'); + model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab'); + model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a'); + model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab'); + + // Set type + model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true); +} + +/* + * END SCHOLAR FOR FIREFOX EXTENSIONS + */ + +///////////////////////////////////////////////////////////////// +// +// Scholar.Utilities.Ingester +// +///////////////////////////////////////////////////////////////// +// Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional +// classes relating to data extraction specifically from HTML documents. 
+ +Scholar.Utilities.Ingester = function(myWindow, proxiedURL) { + this.window = myWindow; + this.proxiedURL = proxiedURL; +} + +Scholar.Utilities.Ingester.prototype = new Scholar.Utilities(); + +// Takes an XPath query and returns the results +Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, parentNode, xpath, nsResolver) { + var elmts = []; + + var iterator = doc.evaluate(xpath, parentNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); + var elmt = iterator.iterateNext(); + var i = 0; + while (elmt) { + elmts[i++] = elmt; + elmt = iterator.iterateNext(); + } + return elmts; +} + +// Loads a single document for a scraper, running succeeded() on success or +// failed() on failure +Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) { + Scholar.debug("loadDocument called"); + this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed); +} + +// Downloads and processes documents with processor() +// browser - a browser object +// firstDoc - the first document to process with the processor (if null, +// first document is processed without processor) +// urls - an array of URLs to load +// processor - a function to execute to process each document +// done - a function to execute when all document processing is complete +// exception - a function to execute if an exception occurs (exceptions are +// also logged in the Scholar for Firefox log) +Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { + var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window); + var myWindow = this.window; + var prevUrl, url; + Scholar.debug("processDocuments called"); + + try { + if (urls.length == 0) { + if(firstDoc) { + processor(firstDoc, done); + } else { + done(); + } + return; + } + + var urlIndex = -1; + var doLoad = function() { + urlIndex++; + if (urlIndex < urls.length) { + url = 
urls[urlIndex]; + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + try { + Scholar.debug("loading "+url); + hiddenBrowser.loadURI(url); + } catch (e) { + Scholar.debug("Scholar.Utilities.Ingester.processDocuments doLoad: " + e, 2); + exception(e); + } + } else { + hiddenBrowser.removeEventListener("load", onLoad, true); + Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser); + done(); + } + }; + var onLoad = function() { + Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded"); + if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times + prevUrl = hiddenBrowser.contentDocument.location.href; + try { + var newHiddenBrowser = new Object(); + newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; + newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; + processor(newHiddenBrowser); + } catch (e) { + Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2); + exception(e); + } + doLoad(); + } + }; + var init = function() { + Scholar.debug("init called"); + hiddenBrowser.addEventListener("load", onLoad, true); + + if (firstDoc) { + Scholar.debug("processing"); + processor(firstDoc, doLoad); + } else { + Scholar.debug("doing load"); + doLoad(); + } + } + + init(); + } catch (e) { + Scholar.debug("processDocuments: " + e); + exception(e); + } +} + +// Appears to look for links in a document containing a certain substring +Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) { + var urls = []; + var addedURLs = []; + + var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); + var aElement = aElements.iterateNext(); + while (aElement) { + var href = aElement.href; + if (href.indexOf(substring) >= 0 && !(addedURLs[href])) { + urls.unshift(href); + addedURLs[href] = true; + } + aElement = aElements.iterateNext(); + } + return urls; +} + +// For now, we're 
going to skip the getLLsFromAddresses function (which gets +// latitude and longitude pairs from a series of addresses, but requires the +// big mess of Java code that is the Piggy Bank server) and the geoHelper +// tools (which rely on getLLsFromAddresses) since these are probably not +// essential components for Scholar and would take a great deal of effort to +// implement. We can, however, always implement them later. + +/* + * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS + */ + +/* + * Gets a given node (assumes only one value) + */ +Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { + return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); +} + +/* + * Gets a given node as a string containing all child nodes + */ +Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) { + var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); + var returnVar = ""; + for(var i=0; i<elmts.length; i++) { + returnVar += elmts[i].nodeValue; + } + return returnVar; +} + +/* + * Allows a user to select which items to scrape + */ +Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) { + // mozillazine made me do it! honest! 
+ var io = { dataIn:itemList, dataOut:null } + var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul", + "_blank","chrome,modal,centerscreen,resizable=yes", io); + return io.dataOut; +} + +/* + * Grabs items based on URLs + */ +Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) { + var availableItems = new Object(); // Technically, associative arrays are objects + + // Require link to match this + if(urlRe) { + var urlRegexp = new RegExp(); + urlRegexp.compile(urlRe, "i"); + } + // Do not allow text to match this + if(rejectRe) { + var rejectRegexp = new RegExp(); + rejectRegexp.compile(rejectRe, "i"); + } + + if(!inHere.length) { + inHere = new Array(inHere); + } + + for(var j=0; j<inHere.length; j++) { + var links = inHere[j].getElementsByTagName("a"); + for(var i=0; i<links.length; i++) { + if(!urlRe || urlRegexp.test(links[i].href)) { + var text = this.getNodeString(doc, links[i], './/text()', null); + if(text) { + text = this.cleanString(text); + if(!rejectRe || !rejectRegexp.test(text)) { + if(availableItems[links[i].href]) { + if(text != availableItems[links[i].href]) { + availableItems[links[i].href] += " "+text; + } + } else { + availableItems[links[i].href] = text; + } + } + } + } + } + } + + return availableItems; +} + +/* + * END SCHOLAR FOR FIREFOX EXTENSIONS + */ + +// These are front ends for XMLHttpRequest. XMLHttpRequest can't actually be +// accessed outside the sandbox, and even if it could, it wouldn't let scripts +// access across domains, so everything's replicated here. 
+Scholar.Utilities.HTTP = function(contentWindow, proxiedURL) { + this.window = contentWindow; + this.proxiedURL = proxiedURL; +} + +Scholar.Utilities.HTTP.prototype.doGet = function(url, onStatus, onDone) { + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + + var xmlhttp = new this.window.XMLHttpRequest(); + + xmlhttp.open('GET', url, true); + xmlhttp.overrideMimeType("text/plain"); + + var me = this; + xmlhttp.onreadystatechange = function() { + me.stateChange(xmlhttp, onStatus, onDone); + }; + xmlhttp.send(null); +} + +Scholar.Utilities.HTTP.prototype.doPost = function(url, body, onStatus, onDone) { + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + + var xmlhttp = new this.window.XMLHttpRequest(); + + xmlhttp.open('POST', url, true); + xmlhttp.overrideMimeType("text/plain"); + + var me = this; + xmlhttp.onreadystatechange = function() { + me.stateChange(xmlhttp, onStatus, onDone); + }; + xmlhttp.send(body); +} + +Scholar.Utilities.HTTP.prototype.doOptions = function(url, body, onStatus, onDone) { + if(this.proxiedURL) { + url = Scholar.Ingester.ProxyMonitor.properToProxy(url); + } + + var xmlhttp = new this.window.XMLHttpRequest(); + + xmlhttp.open('OPTIONS', url, true); + xmlhttp.overrideMimeType("text/plain"); + + var me = this; + xmlhttp.onreadystatechange = function() { + me.stateChange(xmlhttp, onStatus, onDone); + }; + xmlhttp.send(body); +} + +Scholar.Utilities.HTTP.prototype.stateChange = function(xmlhttp, onStatus, onDone) { + switch (xmlhttp.readyState) { + + // Request not yet made + case 1: + break; + + // Contact established with server but nothing downloaded yet + case 2: + try { + // Check for HTTP status 200 + if (xmlhttp.status != 200) { + if (onStatus) { + onStatus( + xmlhttp.status, + xmlhttp.statusText, + xmlhttp + ); + xmlhttp.abort(); + } + } + } catch (e) { + Scholar.debug(e, 2); + } + break; + + // Called multiple times while downloading is in progress + case 3: + break; + 
+ // Download complete + case 4: + try { + if (onDone) { + onDone(xmlhttp.responseText, xmlhttp); + } + } catch (e) { + Scholar.debug(e, 2); + } + break; + } +} +\ No newline at end of file diff --git a/components/chnmIScholarService.js b/components/chnmIScholarService.js @@ -45,6 +45,10 @@ Cc["@mozilla.org/moz/jssubscript-loader;1"] Cc["@mozilla.org/moz/jssubscript-loader;1"] .getService(Ci.mozIJSSubScriptLoader) .loadSubScript("chrome://scholar/content/xpcom/marc.js"); + +Cc["@mozilla.org/moz/jssubscript-loader;1"] + .getService(Ci.mozIJSSubScriptLoader) + .loadSubScript("chrome://scholar/content/xpcom/utilities.js"); /********************************************************************/