www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 7d3deb5b9ffda685851ea2210666a34d25f88723
parent 6c89acbe0de8d48521b19083c0e97849753b8ffc
Author: Simon Kornblith <simon@simonster.com>
Date:   Wed, 21 Jun 2006 01:41:07 +0000

- Make Scholar.Ingester.Utilities.loadDocument() attach its event handler to the "load" event rather than "DOMContentLoaded", to resolve an issue with the Ex Libris/Aleph scraper (VCU)
- When possible, corporate creators/contributors are categorized with their own RDF types (prefixDummy + "corporateCreator" and prefixDummy + "corporateContributor")
- Remove extraneous debug code in extensions


Diffstat:
Mchrome/chromeFiles/content/scholar/xpcom/ingester.js | 41++++++++++++++++++++++++++++++++++++++---
Mscrapers.sql | 35+++++++----------------------------
2 files changed, 45 insertions(+), 31 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -164,7 +164,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD }; var init = function() { Scholar.debug("init called"); - hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true); + hiddenBrowser.addEventListener("load", onLoad, true); if (firstDoc) { Scholar.debug("processing"); @@ -213,6 +213,10 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su * Piggy Bank. When used in external code, the repository will need to add * a function definition when exporting in Piggy Bank format. */ + +/* + * Converts a JavaScript date object to an ISO-style date + */ Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) { var date = ""; var year = jsDate.getFullYear().toString(); @@ -237,10 +241,28 @@ Scholar.Ingester.Utilities.prototype.dateToISO = function(jsDate) { return date; } +/* + * Gets a given node (assumes only one value) + */ Scholar.Ingester.Utilities.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); } +/* + * Gets a given node as a string containing all child nodes + */ +Scholar.Ingester.Utilities.prototype.getNodeString = function(doc, contextNode, xpath, nsResolver) { + var elmts = this.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); + var returnVar = ""; + for(var i=0; i<elmts.length; i++) { + returnVar += elmts[i].nodeValue; + } + return returnVar; +} + +/* + * Cleans extraneous punctuation off an author name + */ Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) { author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); @@ -256,16 +278,25 @@ Scholar.Ingester.Utilities.prototype.cleanAuthor = function(author) { return author; } +/* + * Cleans 
whitespace off a string and replaces multiple spaces with one + */ Scholar.Ingester.Utilities.prototype.cleanString = function(s) { s = this.trimString(s); return s.replace(/ +/g, " "); } +/* + * Cleans any non-world non-parenthesis characters off the ends of a string + */ Scholar.Ingester.Utilities.prototype.superCleanString = function(x) { var x = x.replace(/^[^\w(]+/, ""); return x.replace(/[^\w)]+$/, ""); } +/* + * Eliminates HTML tags, replacing <br>s with /ns + */ Scholar.Ingester.Utilities.prototype.cleanTags = function(x) { x = x.replace(/<br[^>]*>/gi, "\n"); return x.replace(/<[^>]+>/g, ""); @@ -555,6 +586,8 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) { Scholar.debug("Scraping "+this.browser.contentDocument.location.href); + Scholar.debug(this.scraper.scraperJavaScript); + var scraperSandbox = this._sandbox; try { Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox); @@ -563,6 +596,8 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) { this._scrapePageComplete(); } + Scholar.debug("scraping complete"); + // If synchronous, call _scrapePageComplete(); if(!this._waitForCompletion) { this._scrapePageComplete(); @@ -694,13 +729,13 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() { } if(this.model.data[uri][prefixDummy + 'corporateCreator']) { for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) { - newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateCreator'][i], null, 1); + newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 'corporateCreator'][i], 1); creatorIndex++; } } if(this.model.data[uri][prefixDummy + 'corporateContributor']) { for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) { - newItem.setCreator(creatorIndex, this.model.data[uri][prefixDummy + 'corporateContributor'][i], null, 2); + newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 
'corporateContributor'][i], 2); creatorIndex++; } } diff --git a/scrapers.sql b/scrapers.sql @@ -247,7 +247,6 @@ if(!elmts.length) { var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]''; var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); } -utilities.debugPrint(elmts.length); if(elmts && elmts.length) { return true; } @@ -333,7 +332,6 @@ utilities.HTTPUtilities.doPost(''http://www.jstor.org/browse'', postData, null, data[prefixDummy + "series"].push(fieldContent); } else if(fieldCode == "DA") { var date = new Date(fieldContent.replace(".", "")); - utilities.debugPrint(date.valueOf()); if(isNaN(date.valueOf())) { data[prefixDC + "date"].push(fieldContent); } else { @@ -540,7 +538,7 @@ for (var i = 0; i < elmts.length; i++) { rdfUri = prefixDC + ''contributor''; value = utilities.cleanAuthor(node.nodeValue); } else if(field == "corporate author") { - rdfUri = prefixDC + ''creator''; + rdfUri = prefixDummy + ''corporateCreator''; } if(rdfUri) { var insert = true; @@ -807,7 +805,6 @@ if(m) { var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; var m = bylineRegexp.exec(citationData); if(m) { - utilities.debugPrint(m[1].substring(0, 3).toLowerCase()); if(m[1].substring(0, 3).toLowerCase() == "by ") { m[1] = m[1].substring(3); } @@ -835,7 +832,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var newUri = uri.replace("&format=999", "&format=001"); -utilities.debugPrint(newUri); utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; @@ -847,11 +843,11 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]''; var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - var record = new MARC_Record(); + var record = new MARC_Record(); for(var i=0; i<elmts.length; i++) { var elmt = elmts[i]; var field = 
utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue); - var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver).nodeValue; + var value = utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver); var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1"); if(field != "FMT" && field != "LDR") { @@ -868,9 +864,10 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { } } + model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); model = utilities.importMARCRecord(record, uri, model); done(); -}, function() {}) +}, function() {}); wait();'); @@ -882,16 +879,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var newUri = uri+''&fullmarc=true''; -utilities.debugPrint(newUri); - -var utilities.getNodeString = function(doc, contextNode, xpath, nsResolver) { - var elmts = utilities.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver); - var returnVar = ""; - for(var i=0; i<elmts.length; i++) { - returnVar += elmts[i].nodeValue; - } - return returnVar; -} utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; @@ -906,8 +893,8 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { var record = new MARC_Record(); for(var i=0; i<elmts.length; i++) { var elmt = elmts[i]; - var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue); - var value = utilities.getNodeString(doc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver); + var field = utilities.superCleanString(utilities.getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue); + var value = utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver); value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1"); if(field != "FMT" && field != "LDR") { 
@@ -940,7 +927,6 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; var newUri = uri.replace(/function=[A-Z]{7}/, "function=MARCSCR"); -utilities.debugPrint(newUri); utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; @@ -992,8 +978,6 @@ if(uri.indexOf("authority_hits") < 0) { var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc"; } -utilities.debugPrint(newUri); - utilities.HTTPUtilities.doGet(newUri, null, function(text) { var record = new MARC_Record(); record.load(text, "binary"); @@ -1070,7 +1054,6 @@ var nsResolver = namespace ? function(prefix) { var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver); for(i in elmts) { - utilities.debugPrint(elmts[i].nodeValue); if(elmts[i].nodeValue == "\n\nViewing record\n") { return true; } @@ -1090,7 +1073,6 @@ var uri = doc.location.href; var uriRegexp = /^(.*)(\/[0-9]+)$/; var m = uriRegexp.exec(uri); var newUri = m[1]+"/40"; -utilities.debugPrint(newUri); var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver); for(i in elmts) { @@ -1154,7 +1136,6 @@ var nsResolver = namespace ? 
function(prefix) { var uri = doc.location.href; var newUri = uri.replace("LabelDisplay", "MARCDisplay"); -utilities.debugPrint(newUri); utilities.loadDocument(newUri, browser, function(newBrowser) { newDoc = newBrowser.contentDocument; @@ -1206,7 +1187,6 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { } record.add_field(tag, ind1, ind2, content); - utilities.debugPrint("tag:"+tag+" ind1:"+ind1+" ind2:"+ind2+" content:"+content); } model = utilities.importMARCRecord(record, uri, model); @@ -1304,7 +1284,6 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { var xml = new XML(text); for(var i=0; i<xml.PubmedArticle.length(); i++) { - utilities.debugPrint("one article..."); var citation = xml.PubmedArticle[i].MedlineCitation; if(citation.PMID.length()) {