www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 3d881eec13524c58c51389d8fe9956f40a92ed96
parent 953b1f9d209c8fcefc9a60adf165b46b2146ce90
Author: Simon Kornblith <simon@simonster.com>
Date:   Sat, 17 Jun 2006 21:21:15 +0000

- Make scrapers return standard ISO-style YYYY-MM-DD dates. Still need to work on journal article scrapers.
- Ingester lets callback function save items, rather than saving them itself.
- Better handling of multiple items in API, although no scrapers currently implement this.



Diffstat:
Mchrome/chromeFiles/content/scholar/ingester/browser.js | 21++++++++++++++-------
Mchrome/chromeFiles/content/scholar/xpcom/ingester.js | 204++++++++++++++++++++++++++++++++++++++++---------------------------------------
Mscrapers.sql | 30++++++++++++++++++++++++++++--
3 files changed, 145 insertions(+), 110 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -211,18 +211,20 @@ Scholar.Ingester.Interface._deleteDocument = function(browser) { /* * Callback to be executed when scraping is complete */ -Scholar.Ingester.Interface._finishScraping = function(documentObject) { - if(documentObject.item) { +Scholar.Ingester.Interface._finishScraping = function(obj) { + if(obj.items.length) { + var item1 = obj.items[0]; + Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete")); - var fields = Scholar.ItemFields.getItemTypeFields(documentObject.item.getField("itemTypeID")); + var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID")); var titleLabel = Scholar.getString("itemFields.title") + ":" - Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, this.item.getField("title")); - var creators = documentObject.item.numCreators(); + Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, item1.getField("title")); + var creators = item1.numCreators(); if(creators) { for(var i=0; i<creators; i++) { - var creator = documentObject.item.getCreator(i); + var creator = item1.getCreator(i); var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":"; var data = creator.firstName + ' ' + creator.lastName; Scholar.Ingester.Interface.scrapeProgress.addResult(label, data); @@ -230,7 +232,7 @@ Scholar.Ingester.Interface._finishScraping = function(documentObject) { } for(i in fields) { - var data = documentObject.item.getField(fields[i]); + var data = item1.getField(fields[i]); if(data) { var name = Scholar.ItemFields.getName(fields[i]); if(name != "source") { @@ -239,6 +241,11 @@ Scholar.Ingester.Interface._finishScraping = function(documentObject) { } } } + + // Save items + for(i in obj.items) { + obj.items[i].save(); + } } else { Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError")); Scholar.Ingester.Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription")); diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -49,7 +49,7 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {} // Scholar.Ingester.Utilities class, a set of methods to assist in data // extraction. Most code here was stolen directly from the Piggy Bank project. Scholar.Ingester.Utilities = function(hiddenBrowser) { - this.hiddenBrowser = hiddenBrowser; + this._hiddenBrowser = hiddenBrowser; } // Adapter for Piggy Bank function to print debug messages; log level is @@ -115,7 +115,7 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe // exception - a function to execute if an exception occurs (exceptions are // also logged in the Firefox Scholar log) Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { - var hiddenBrowser = this.hiddenBrowser; + var hiddenBrowser = this._hiddenBrowser; Scholar.debug("processDocuments called"); try { @@ -301,11 +301,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo // Extract title model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString); // Extract edition - model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'edition', this._MARCCleanString); + model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString); // Extract place info model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a'); // Extract publisher info model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b'); + // Extract year + model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCCleanString, '', 'c'); // Extract series model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString); } @@ -411,9 +413,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu * browser - browser window object of document * model - data model for semantic scrapers * scraper - best scraper to use to scrape page + * items - items returned after page is scraped * * Private properties: * _sandbox - sandbox for code execution + * _appSvc - AppShellService instance + * _hiddenBrowser - hiden browser object + * _scrapeCallback - callback function to be executed when scraping is complete */ ////////////////////////////////////////////////////////////////////////////// @@ -426,12 +432,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu * Constructor for Document object */ Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){ + this.scraper = null; this.browser = browserWindow; this.model = new Scholar.Ingester.Model(); - this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"] + this.items = new Array(); + this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"] .getService(Ci.nsIAppShellService); - this.scraper = null; - this.hiddenBrowser = hiddenBrowser; + this._hiddenBrowser = hiddenBrowser; this._generateSandbox(); } @@ -474,7 +481,7 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) { if((!currentScraper.urlPattern || canScrape) && currentScraper.scraperDetectCode) { Scholar.debug("Checking scraperDetectCode"); - var scraperSandbox = this.sandbox; + var scraperSandbox = this._sandbox; try { canScrape = Components.utils.evalInSandbox("(function(){\n" + currentScraper.scraperDetectCode + @@ -498,7 +505,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) { Scholar.debug("Scraping "+this.browser.contentDocument.location.href); - var scraperSandbox = this.sandbox; + var scraperSandbox = this._sandbox; try { Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox); } catch(e) { @@ -550,20 +557,20 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function() { * Generates a sandbox for scraping/scraper detection */ Scholar.Ingester.Document.prototype._generateSandbox = function() { - this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href); - this.sandbox.browser = this.browser; - this.sandbox.doc = this.sandbox.browser.contentDocument; - this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser); - this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow); - this.sandbox.window = this.window; - this.sandbox.model = this.model; - this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult; - this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record; - this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record(); + this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href); + this._sandbox.browser = this.browser; + this._sandbox.doc = this._sandbox.browser.contentDocument; + this._sandbox.utilities = new Scholar.Ingester.Utilities(this._hiddenBrowser); + this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow); + this._sandbox.window = this.window; + this._sandbox.model = this.model; + this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult; + this._sandbox.MARC_Record = Scholar.Ingester.MARC_Record; + this._sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record(); var me = this; - this.sandbox.wait = function(){ me._waitForCompletion = true; }; - this.sandbox.done = function(){ me._scrapePageComplete(); }; + this._sandbox.wait = function(){ me._waitForCompletion = true; }; + this._sandbox.done = function(){ me._scrapePageComplete(); }; } /* @@ -571,103 +578,98 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() { * (Ontologies are hard-coded until we have a real way of dealing with them) */ Scholar.Ingester.Document.prototype._updateDatabase = function() { + Scholar.debug("doing updating"); + var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'; var prefixDC = 'http://purl.org/dc/elements/1.1/'; var prefixDCMI = 'http://purl.org/dc/dcmitype/'; var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/'; - for(var uri in this.model.data) { - if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { - var newItem = Scholar.Items.getNewItemByType(2); - } else { - var newItem = Scholar.Items.getNewItemByType(1); - } - newItem.setField("source", uri); - if(this.model.data[uri][prefixDC + 'title']) { - newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]); - } - var creatorIndex = 0; - if(this.model.data[uri][prefixDC + 'creator']) { - for(i in this.model.data[uri][prefixDC + 'creator']) { - var creator = this.model.data[uri][prefixDC + 'creator'][i]; - var spaceIndex = creator.lastIndexOf(" "); - var lastName = creator.substring(spaceIndex+1, creator.length); - var firstName = creator.substring(0, spaceIndex); - - newItem.setCreator(creatorIndex, firstName, lastName, 1); - creatorIndex++; - } - } - if(this.model.data[uri][prefixDC + 'contributor']) { - for(i in this.model.data[uri][prefixDC + 'contributor']) { - var creator = this.model.data[uri][prefixDC + 'contributor'][i]; - var spaceIndex = creator.lastIndexOf(" "); - var lastName = creator.substring(spaceIndex+1, creator.length); - var firstName = creator.substring(0, spaceIndex); - - newItem.setCreator(creatorIndex, firstName, lastName, 2); - creatorIndex++; - } - } - if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { - if(this.model.data[uri][prefixDummy + 'publication']) { - newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]); + try { + for(var uri in this.model.data) { + if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { + var newItem = Scholar.Items.getNewItemByType(2); + } else { + var newItem = Scholar.Items.getNewItemByType(1); } - if(this.model.data[uri][prefixDummy + 'volume']) { - newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]); + newItem.setField("source", uri); + if(this.model.data[uri][prefixDC + 'title']) { + newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]); } - if(this.model.data[uri][prefixDummy + 'number']) { - newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]); + var creatorIndex = 0; + if(this.model.data[uri][prefixDC + 'creator']) { + for(i in this.model.data[uri][prefixDC + 'creator']) { + var creator = this.model.data[uri][prefixDC + 'creator'][i]; + var spaceIndex = creator.lastIndexOf(" "); + var lastName = creator.substring(spaceIndex+1, creator.length); + var firstName = creator.substring(0, spaceIndex); + + newItem.setCreator(creatorIndex, firstName, lastName, 1); + creatorIndex++; + } } - if(this.model.data[uri][prefixDummy + 'pages']) { - newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]); + if(this.model.data[uri][prefixDC + 'contributor']) { + for(i in this.model.data[uri][prefixDC + 'contributor']) { + var creator = this.model.data[uri][prefixDC + 'contributor'][i]; + var spaceIndex = creator.lastIndexOf(" "); + var lastName = creator.substring(spaceIndex+1, creator.length); + var firstName = creator.substring(0, spaceIndex); + + newItem.setCreator(creatorIndex, firstName, lastName, 2); + creatorIndex++; + } } - if(this.model.data[uri][prefixDC + 'identifier']) { - for(i in this.model.data[uri][prefixDC + 'identifier']) { - if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') { - newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); - break; + if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) { + if(this.model.data[uri][prefixDummy + 'publication']) { + newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]); + } + if(this.model.data[uri][prefixDummy + 'volume']) { + newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]); + } + if(this.model.data[uri][prefixDummy + 'number']) { + newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]); + } + if(this.model.data[uri][prefixDummy + 'pages']) { + newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]); + } + if(this.model.data[uri][prefixDC + 'identifier']) { + for(i in this.model.data[uri][prefixDC + 'identifier']) { + if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') { + newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); + break; + } } } - } - } else { - if(this.model.data[uri][prefixDC + 'publisher']) { - newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]); - } - if(this.model.data[uri][prefixDC + 'year']) { - if(this.model.data[uri][prefixDC + 'year'].length == 4) { + } else { + if(this.model.data[uri][prefixDC + 'publisher']) { + newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]); + } + if(this.model.data[uri][prefixDC + 'year']) { newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]); - } else { - try { - newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring( - this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1, - this.model.data[uri][prefixDC + 'year'][0].length)); - } catch(e) {} + } else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) { + newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); } - } - if(this.model.data[uri][prefixDC + 'edition']) { - newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]); - } - if(this.model.data[uri][prefixDummy + 'series']) { - newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]); - } - if(this.model.data[uri][prefixDummy + 'place']) { - newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]); - } - if(this.model.data[uri][prefixDC + 'identifier']) { - for(i in this.model.data[uri][prefixDC + 'identifier']) { - if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') { - newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); - break; + if(this.model.data[uri][prefixDC + 'hasVersion']) { + newItem.setField("edition", this.model.data[uri][prefixDC + 'hasVersion'][0]); + } + if(this.model.data[uri][prefixDummy + 'series']) { + newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]); + } + if(this.model.data[uri][prefixDummy + 'place']) { + newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]); + } + if(this.model.data[uri][prefixDC + 'identifier']) { + for(i in this.model.data[uri][prefixDC + 'identifier']) { + if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') { + newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5)); + break; + } } } } + this.items.push(newItem); } - newItem.save(); - - // First one is stored so as to be accessible - if(!this.item) { - this.item = newItem; - } + } catch(ex) { + Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex); } } \ No newline at end of file diff --git a/scrapers.sql b/scrapers.sql @@ -22,6 +22,30 @@ var cleanString = function(s) { return s.replace(/ +/g, " "); } +var dateToISO = function(jsDate) { + var date = ""; + var year = jsDate.getFullYear().toString(); + var month = (jsDate.getMonth()+1).toString(); + var day = jsDate.getDate().toString(); + + for(var i = year.length; i<4; i++) { + date += "0"; + } + date += year+"-"; + + if(month.length == 1) { + date += "0"; + } + date += month+"-"; + + if(day.length == 1) { + date += "0"; + } + date += day; + + return date; +} + var uri = doc.location.href; model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); @@ -43,10 +67,12 @@ for (var i = 0; i < elmts.length; i++) { var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) { var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); - if(attribute == "Publisher:") { if(value.lastIndexOf("(") != -1) { - var date = value.substring(value.lastIndexOf("(")+1, value.length-1); + var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1); + jsDate = new Date(jsDate); + var date = dateToISO(jsDate); + value = value.substring(0, value.lastIndexOf("(")-1); } if(value.lastIndexOf(";") != -1) {