commit 3d881eec13524c58c51389d8fe9956f40a92ed96
parent 953b1f9d209c8fcefc9a60adf165b46b2146ce90
Author: Simon Kornblith <simon@simonster.com>
Date: Sat, 17 Jun 2006 21:21:15 +0000
- Make scrapers return standard ISO-style YYYY-MM-DD dates. Still need to work on journal article scrapers.
- Ingester lets callback function save items, rather than saving them itself.
- Better handling of multiple items in API, although no scrapers currently implement this.
Diffstat:
3 files changed, 145 insertions(+), 110 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -211,18 +211,20 @@ Scholar.Ingester.Interface._deleteDocument = function(browser) {
/*
* Callback to be executed when scraping is complete
*/
-Scholar.Ingester.Interface._finishScraping = function(documentObject) {
- if(documentObject.item) {
+Scholar.Ingester.Interface._finishScraping = function(obj) {
+ if(obj.items.length) {
+ var item1 = obj.items[0];
+
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete"));
- var fields = Scholar.ItemFields.getItemTypeFields(documentObject.item.getField("itemTypeID"));
+ var fields = Scholar.ItemFields.getItemTypeFields(item1.getField("itemTypeID"));
var titleLabel = Scholar.getString("itemFields.title") + ":"
- Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, this.item.getField("title"));
- var creators = documentObject.item.numCreators();
+ Scholar.Ingester.Interface.scrapeProgress.addResult(titleLabel, item1.getField("title"));
+ var creators = item1.numCreators();
if(creators) {
for(var i=0; i<creators; i++) {
- var creator = documentObject.item.getCreator(i);
+ var creator = item1.getCreator(i);
var label = Scholar.getString("creatorTypes."+Scholar.CreatorTypes.getTypeName(creator.creatorTypeID)) + ":";
var data = creator.firstName + ' ' + creator.lastName;
Scholar.Ingester.Interface.scrapeProgress.addResult(label, data);
@@ -230,7 +232,7 @@ Scholar.Ingester.Interface._finishScraping = function(documentObject) {
}
for(i in fields) {
- var data = documentObject.item.getField(fields[i]);
+ var data = item1.getField(fields[i]);
if(data) {
var name = Scholar.ItemFields.getName(fields[i]);
if(name != "source") {
@@ -239,6 +241,11 @@ Scholar.Ingester.Interface._finishScraping = function(documentObject) {
}
}
}
+
+ // Save items
+ for(i in obj.items) {
+ obj.items[i].save();
+ }
} else {
Scholar.Ingester.Interface.scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError"));
Scholar.Ingester.Interface.scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription"));
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -49,7 +49,7 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
// Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction. Most code here was stolen directly from the Piggy Bank project.
Scholar.Ingester.Utilities = function(hiddenBrowser) {
- this.hiddenBrowser = hiddenBrowser;
+ this._hiddenBrowser = hiddenBrowser;
}
// Adapter for Piggy Bank function to print debug messages; log level is
@@ -115,7 +115,7 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
// exception - a function to execute if an exception occurs (exceptions are
// also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
- var hiddenBrowser = this.hiddenBrowser;
+ var hiddenBrowser = this._hiddenBrowser;
Scholar.debug("processDocuments called");
try {
@@ -301,11 +301,13 @@ Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, mo
// Extract title
model = this._MARCAssociateField(record, uri, model, '245', prefixDC + 'title', this._MARCCleanString);
// Extract edition
- model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'edition', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString);
// Extract place info
model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a');
// Extract publisher info
model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b');
+ // Extract year
+ model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCCleanString, '', 'c');
// Extract series
model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString);
}
@@ -411,9 +413,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
* browser - browser window object of document
* model - data model for semantic scrapers
* scraper - best scraper to use to scrape page
+ * items - items returned after page is scraped
*
* Private properties:
* _sandbox - sandbox for code execution
+ * _appSvc - AppShellService instance
+ * _hiddenBrowser - hidden browser object
+ * _scrapeCallback - callback function to be executed when scraping is complete
*/
//////////////////////////////////////////////////////////////////////////////
@@ -426,12 +432,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
* Constructor for Document object
*/
Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
+ this.scraper = null;
this.browser = browserWindow;
this.model = new Scholar.Ingester.Model();
- this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
+ this.items = new Array();
+ this._appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
.getService(Ci.nsIAppShellService);
- this.scraper = null;
- this.hiddenBrowser = hiddenBrowser;
+ this._hiddenBrowser = hiddenBrowser;
this._generateSandbox();
}
@@ -474,7 +481,7 @@ Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) {
if((!currentScraper.urlPattern || canScrape)
&& currentScraper.scraperDetectCode) {
Scholar.debug("Checking scraperDetectCode");
- var scraperSandbox = this.sandbox;
+ var scraperSandbox = this._sandbox;
try {
canScrape = Components.utils.evalInSandbox("(function(){\n" +
currentScraper.scraperDetectCode +
@@ -498,7 +505,7 @@ Scholar.Ingester.Document.prototype.scrapePage = function(callback) {
Scholar.debug("Scraping "+this.browser.contentDocument.location.href);
- var scraperSandbox = this.sandbox;
+ var scraperSandbox = this._sandbox;
try {
Components.utils.evalInSandbox(this.scraper.scraperJavaScript, scraperSandbox);
} catch(e) {
@@ -550,20 +557,20 @@ Scholar.Ingester.Document.prototype._scrapePageComplete = function() {
* Generates a sandbox for scraping/scraper detection
*/
Scholar.Ingester.Document.prototype._generateSandbox = function() {
- this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
- this.sandbox.browser = this.browser;
- this.sandbox.doc = this.sandbox.browser.contentDocument;
- this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser);
- this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
- this.sandbox.window = this.window;
- this.sandbox.model = this.model;
- this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
- this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
- this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
+ this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
+ this._sandbox.browser = this.browser;
+ this._sandbox.doc = this._sandbox.browser.contentDocument;
+ this._sandbox.utilities = new Scholar.Ingester.Utilities(this._hiddenBrowser);
+ this._sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this._appSvc.hiddenDOMWindow);
+ this._sandbox.window = this.window;
+ this._sandbox.model = this.model;
+ this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
+ this._sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
+ this._sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
var me = this;
- this.sandbox.wait = function(){ me._waitForCompletion = true; };
- this.sandbox.done = function(){ me._scrapePageComplete(); };
+ this._sandbox.wait = function(){ me._waitForCompletion = true; };
+ this._sandbox.done = function(){ me._scrapePageComplete(); };
}
/*
@@ -571,103 +578,98 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
* (Ontologies are hard-coded until we have a real way of dealing with them)
*/
Scholar.Ingester.Document.prototype._updateDatabase = function() {
+ Scholar.debug("doing updating");
+
var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#';
var prefixDC = 'http://purl.org/dc/elements/1.1/';
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
- for(var uri in this.model.data) {
- if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
- var newItem = Scholar.Items.getNewItemByType(2);
- } else {
- var newItem = Scholar.Items.getNewItemByType(1);
- }
- newItem.setField("source", uri);
- if(this.model.data[uri][prefixDC + 'title']) {
- newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
- }
- var creatorIndex = 0;
- if(this.model.data[uri][prefixDC + 'creator']) {
- for(i in this.model.data[uri][prefixDC + 'creator']) {
- var creator = this.model.data[uri][prefixDC + 'creator'][i];
- var spaceIndex = creator.lastIndexOf(" ");
- var lastName = creator.substring(spaceIndex+1, creator.length);
- var firstName = creator.substring(0, spaceIndex);
-
- newItem.setCreator(creatorIndex, firstName, lastName, 1);
- creatorIndex++;
- }
- }
- if(this.model.data[uri][prefixDC + 'contributor']) {
- for(i in this.model.data[uri][prefixDC + 'contributor']) {
- var creator = this.model.data[uri][prefixDC + 'contributor'][i];
- var spaceIndex = creator.lastIndexOf(" ");
- var lastName = creator.substring(spaceIndex+1, creator.length);
- var firstName = creator.substring(0, spaceIndex);
-
- newItem.setCreator(creatorIndex, firstName, lastName, 2);
- creatorIndex++;
- }
- }
- if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
- if(this.model.data[uri][prefixDummy + 'publication']) {
- newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]);
+ try {
+ for(var uri in this.model.data) {
+ if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
+ var newItem = Scholar.Items.getNewItemByType(2);
+ } else {
+ var newItem = Scholar.Items.getNewItemByType(1);
}
- if(this.model.data[uri][prefixDummy + 'volume']) {
- newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]);
+ newItem.setField("source", uri);
+ if(this.model.data[uri][prefixDC + 'title']) {
+ newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
}
- if(this.model.data[uri][prefixDummy + 'number']) {
- newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]);
+ var creatorIndex = 0;
+ if(this.model.data[uri][prefixDC + 'creator']) {
+ for(i in this.model.data[uri][prefixDC + 'creator']) {
+ var creator = this.model.data[uri][prefixDC + 'creator'][i];
+ var spaceIndex = creator.lastIndexOf(" ");
+ var lastName = creator.substring(spaceIndex+1, creator.length);
+ var firstName = creator.substring(0, spaceIndex);
+
+ newItem.setCreator(creatorIndex, firstName, lastName, 1);
+ creatorIndex++;
+ }
}
- if(this.model.data[uri][prefixDummy + 'pages']) {
- newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]);
+ if(this.model.data[uri][prefixDC + 'contributor']) {
+ for(i in this.model.data[uri][prefixDC + 'contributor']) {
+ var creator = this.model.data[uri][prefixDC + 'contributor'][i];
+ var spaceIndex = creator.lastIndexOf(" ");
+ var lastName = creator.substring(spaceIndex+1, creator.length);
+ var firstName = creator.substring(0, spaceIndex);
+
+ newItem.setCreator(creatorIndex, firstName, lastName, 2);
+ creatorIndex++;
+ }
}
- if(this.model.data[uri][prefixDC + 'identifier']) {
- for(i in this.model.data[uri][prefixDC + 'identifier']) {
- if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') {
- newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
- break;
+ if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
+ if(this.model.data[uri][prefixDummy + 'publication']) {
+ newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'volume']) {
+ newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'number']) {
+ newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'pages']) {
+ newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'identifier']) {
+ for(i in this.model.data[uri][prefixDC + 'identifier']) {
+ if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') {
+ newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
+ break;
+ }
}
}
- }
- } else {
- if(this.model.data[uri][prefixDC + 'publisher']) {
- newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
- }
- if(this.model.data[uri][prefixDC + 'year']) {
- if(this.model.data[uri][prefixDC + 'year'].length == 4) {
+ } else {
+ if(this.model.data[uri][prefixDC + 'publisher']) {
+ newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'year']) {
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
- } else {
- try {
- newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
- this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
- this.model.data[uri][prefixDC + 'year'][0].length));
- } catch(e) {}
+ } else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
+ newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
}
- }
- if(this.model.data[uri][prefixDC + 'edition']) {
- newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'series']) {
- newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'place']) {
- newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
- }
- if(this.model.data[uri][prefixDC + 'identifier']) {
- for(i in this.model.data[uri][prefixDC + 'identifier']) {
- if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
- newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
- break;
+ if(this.model.data[uri][prefixDC + 'hasVersion']) {
+ newItem.setField("edition", this.model.data[uri][prefixDC + 'hasVersion'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'series']) {
+ newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'place']) {
+ newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'identifier']) {
+ for(i in this.model.data[uri][prefixDC + 'identifier']) {
+ if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
+ newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
+ break;
+ }
}
}
}
+ this.items.push(newItem);
}
- newItem.save();
-
- // First one is stored so as to be accessible
- if(!this.item) {
- this.item = newItem;
- }
+ } catch(ex) {
+ Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex);
}
}
\ No newline at end of file
diff --git a/scrapers.sql b/scrapers.sql
@@ -22,6 +22,30 @@ var cleanString = function(s) {
return s.replace(/ +/g, " ");
}
+var dateToISO = function(jsDate) {
+ var date = "";
+ var year = jsDate.getFullYear().toString();
+ var month = (jsDate.getMonth()+1).toString();
+ var day = jsDate.getDate().toString();
+
+ for(var i = year.length; i<4; i++) {
+ date += "0";
+ }
+ date += year+"-";
+
+ if(month.length == 1) {
+ date += "0";
+ }
+ date += month+"-";
+
+ if(day.length == 1) {
+ date += "0";
+ }
+ date += day;
+
+ return date;
+}
+
var uri = doc.location.href;
model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
@@ -43,10 +67,12 @@ for (var i = 0; i < elmts.length; i++) {
var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
-
if(attribute == "Publisher:") {
if(value.lastIndexOf("(") != -1) {
- var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
+ var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1);
+ jsDate = new Date(jsDate);
+ var date = dateToISO(jsDate);
+
value = value.substring(0, value.lastIndexOf("(")-1);
}
if(value.lastIndexOf(";") != -1) {