commit 098078627c3e3f3ae93260445561318f5cb2cf4e
parent b4d65420f39e2c1390c626fbaa07575ceddc0186
Author: Simon Kornblith <simon@simonster.com>
Date: Fri, 23 Jun 2006 03:02:30 +0000
- Make the event listeners that listened for DOMContentLoaded listen for load instead, because DOMContentLoaded does not seem ready for prime time (hey, it's undocumented, what can you expect)
- Make Amazon scraper work with multiple documents
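- Fix bugs in processDocuments (keep the load listener attached until every URL has been processed, ignore repeated load events for the same URL, and call done() directly instead of through setTimeout)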
- Make Scholar.Ingester.Utilities.getItemArray() accept an array of DOM nodes to search for links (a single node still works), finally taking advantage of the fact that a single node object has no length
Diffstat:
3 files changed, 145 insertions(+), 82 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -41,8 +41,8 @@ Scholar_Ingester_Interface.chromeLoad = function() {
// this gives us onLocationChange
Scholar_Ingester_Interface.tabBrowser.addProgressListener(Scholar_Ingester_Interface.Listener,
Components.interfaces.nsIWebProgress.NOTIFY_LOCATION);
- // this gives us DOMContentLoaded
- Scholar_Ingester_Interface.appContent.addEventListener("DOMContentLoaded",
+ // let's use load instead of DOMContentLoaded
+ Scholar_Ingester_Interface.appContent.addEventListener("load",
Scholar_Ingester_Interface.contentLoad, true);
}
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -131,11 +131,13 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
// also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(this.window);
+ var myWindow = this.window;
+ var prevUrl, url;
Scholar.debug("processDocuments called");
try {
if (urls.length == 0) {
- if (firstDoc) {
+ if(firstDoc) {
processor(firstDoc, done);
} else {
done();
@@ -148,7 +150,7 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
urlIndex++;
if (urlIndex < urls.length) {
try {
- var url = urls[urlIndex];
+ url = urls[urlIndex];
Scholar.debug("loading "+url);
hiddenBrowser.loadURI(url);
} catch (e) {
@@ -156,23 +158,26 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
exception(e);
}
} else {
+ hiddenBrowser.removeEventListener("load", onLoad, true);
Scholar.Ingester.deleteHiddenBrowser(hiddenBrowser);
- hiddenBrowser.setTimeout(done, 10);
+ done();
}
};
var onLoad = function() {
- Scholar.debug("onLoad called");
- hiddenBrowser.removeEventListener("load", onLoad, true);
- try {
- var newHiddenBrowser = new Object();
- newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
- newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
- processor(newHiddenBrowser);
- } catch (e) {
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
- exception(e);
+ Scholar.debug(hiddenBrowser.contentDocument.location.href+" has been loaded");
+ if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times
+ prevUrl = hiddenBrowser.contentDocument.location.href;
+ try {
+ var newHiddenBrowser = new Object();
+ newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
+ newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
+ processor(newHiddenBrowser);
+ } catch (e) {
+ Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+ exception(e);
+ }
+ doLoad();
}
- doLoad();
};
var init = function() {
Scholar.debug("init called");
@@ -332,23 +337,33 @@ Scholar.Ingester.Utilities.prototype.getItemArray = function(doc, inHere, urlRe,
var availableItems = new Object(); // Technically, associative arrays are objects
// Require link to match this
- var tagRegexp = new RegExp();
- tagRegexp.compile(urlRe);
+ if(urlRe) {
+ var urlRegexp = new RegExp();
+ urlRegexp.compile(urlRe);
+ }
// Do not allow text to match this
- var rejectRegexp = new RegExp();
- rejectRegexp.compile(rejectRe);
+ if(rejectRe) {
+ var rejectRegexp = new RegExp();
+ rejectRegexp.compile(rejectRe);
+ }
- var links = inHere.getElementsByTagName("a");
- for(var i=0; i<links.length; i++) {
- if(tagRegexp.test(links[i].href)) {
- var text = this.getNodeString(doc, links[i], './/text()', null);
- if(text) {
- text = this.cleanString(text);
- if(!rejectRegexp.test(text)) {
- if(availableItems[links[i].href]) {
- availableItems[links[i].href] += " "+text;
- } else {
- availableItems[links[i].href] = text;
+ if(!inHere.length) {
+ inHere = new Array(inHere);
+ }
+
+ for(var j=0; j<inHere.length; j++) {
+ var links = inHere[j].getElementsByTagName("a");
+ for(var i=0; i<links.length; i++) {
+ if(!urlRe || urlRegexp.test(links[i].href)) {
+ var text = this.getNodeString(doc, links[i], './/text()', null);
+ if(text) {
+ text = this.cleanString(text);
+ if(!rejectRe || !rejectRegexp.test(text)) {
+ if(availableItems[links[i].href]) {
+ availableItems[links[i].href] += " "+text;
+ } else {
+ availableItems[links[i].href] = text;
+ }
}
}
}
@@ -822,7 +837,16 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
if(this.model.data[uri][prefixDC + 'year']) {
newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
} else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) {
- newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
+ var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/
+ if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) {
+ newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4));
+ } else {
+ var m;
+ var yearRe = /[0-9]{4}$/;
+ if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) {
+ newItem.setField("year", m[0]);
+ }
+ }
}
}
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,9 +1,9 @@
--- 10
+-- 11
-- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 16:51:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-22 22:58:00'));
-REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-18 10:15:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
@@ -13,59 +13,98 @@ var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
-var uri = doc.location.href;
-
-// Retrieve authors
-var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
-var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-for (var i = 0; i < elmts.length; i++) {
- var elmt = elmts[i];
+function scrape(doc) {
+ uri = doc.location.href;
- model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
-}
-
-// Retrieve data from "Product Details" box
-var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
-var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-for (var i = 0; i < elmts.length; i++) {
- var elmt = elmts[i];
- var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
- if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
- var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
- if(attribute == "Publisher:") {
- if(value.lastIndexOf("(") != -1) {
- var jsDate = value.substring(value.lastIndexOf("(")+1, value.length-1);
- jsDate = new Date(jsDate);
- var date = utilities.dateToISO(jsDate);
-
- value = value.substring(0, value.lastIndexOf("(")-1);
- }
- if(value.lastIndexOf(";") != -1) {
- var edition = value.substring(value.lastIndexOf(";")+2, value.length);
- value = value.substring(0, value.lastIndexOf(";"));
+ // Retrieve authors
+ var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
+ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+ for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+
+ model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
+ }
+
+ // Retrieve data from "Product Details" box
+ var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
+ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+ for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+ var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
+ if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
+ var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
+ if(attribute == "Publisher:") {
+ if(value.lastIndexOf("(") != -1) {
+ var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
+ jsDate = new Date(date);
+ if(!isNaN(jsDate.valueOf())) {
+ date = utilities.dateToISO(jsDate);
+ }
+
+ value = value.substring(0, value.lastIndexOf("(")-1);
+ }
+ if(value.lastIndexOf(";") != -1) {
+ var edition = value.substring(value.lastIndexOf(";")+2, value.length);
+ value = value.substring(0, value.lastIndexOf(";"));
+ }
+ model.addStatement(uri, prefixDC + ''publisher'', value);
+ model.addStatement(uri, prefixDC + ''date'', date);
+ model.addStatement(uri, prefixDC + ''hasVersion'', edition);
+ } else if(attribute == "Language:") {
+ model.addStatement(uri, prefixDC + ''language'', value);
+ } else if(attribute == "ISBN:") {
+ model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
+ } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
+ model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
+ model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
}
- model.addStatement(uri, prefixDC + ''publisher'', value);
- model.addStatement(uri, prefixDC + ''date'', date);
- model.addStatement(uri, prefixDC + ''hasVersion'', edition);
- } else if(attribute == "Language:") {
- model.addStatement(uri, prefixDC + ''language'', value);
- } else if(attribute == "ISBN:") {
- model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
- } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
- model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
- model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
}
}
+
+ var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
+ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+ var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
+ if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
+ title = title.substring(0, title.lastIndexOf("(")-1);
+ }
+ model.addStatement(uri, prefixDC + ''title'', title);
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
}
-var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
-var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
-if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
- title = title.substring(0, title.lastIndexOf("(")-1);
-}
-model.addStatement(uri, prefixDC + ''title'', title);
-model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);');
+var searchRe = new RegExp(''http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)'');
+var m = searchRe.exec(doc.location.href)
+if(m) {
+ // Why can''t amazon use standard stylesheets
+ var xpath;
+ if(m == "gp/search/") {
+ xpath = ''//table[@class="searchresults"]'';
+ } else {
+ xpath = ''//table[@cellpadding="3"]'';
+ }
+
+ var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+ var items = utilities.getItemArray(doc, searchresults, ''http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$'');
+ items = utilities.selectItems(items);
+
+ if(!items) {
+ return true;
+ }
+
+ var uris = new Array();
+ for(i in items) {
+ uris.push(i);
+ }
+
+ utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
+ function() {
+ utilities.debugPrint("look, done");
+ done();
+ }, function() {});
+
+ wait();
+} else {
+ scrape(doc);
+}');
REPLACE INTO "scrapers" VALUES('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-18 11:02:00', 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {