commit 7b7d3d85e38aa77c4c47993ea03a451770afedda
parent b8ddba3a67383eeddf55f12a47186db102b43a69
Author: Simon Kornblith <simon@simonster.com>
Date: Fri, 8 Sep 2006 05:47:47 +0000
- added Washington Post translator
- translation works properly even when a user has switched to a different page
Diffstat:
4 files changed, 178 insertions(+), 69 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -167,8 +167,6 @@ Scholar_Ingester_Interface.tabClose = function(event) {
Scholar_Ingester_Interface.tabSelect = function(event) {
var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser);
Scholar_Ingester_Interface._updateStatus(data);
- // Make sure scrape progress is gone
- Scholar_Ingester_Interface.Progress.kill();
}
Scholar_Ingester_Interface.hidePopup = function(collectionID) {
diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js
@@ -558,14 +558,13 @@ Scholar.Translate.prototype._generateSandbox = function() {
this._sandbox.Scholar.nextItem = function() { return me._exportGetItem() };
this._sandbox.Scholar.nextCollection = function() { return me._exportGetCollection() }
} else {
- // add routines to add new items
- this._sandbox.Scholar.Item = Scholar.Translate.ScholarItem;
- // attach the function to be run when an item is done
+ // copy routines to add new items
+ this._sandbox.Scholar.Item = Scholar.Translate.GenerateScholarItemClass();
this._sandbox.Scholar.Item.prototype.complete = function() {me._itemDone(this)};
if(this.type == "import") {
// add routines to add new collections
- this._sandbox.Scholar.Collection = Scholar.Translate.ScholarCollection;
+ // use the collection class generator (not the item one), so collections
+ // do not pick up item-only fields and get a constructor of their own
+ this._sandbox.Scholar.Collection = Scholar.Translate.GenerateScholarCollectionClass();
// attach the function to be run when a collection is done
this._sandbox.Scholar.Collection.prototype.complete = function() {me._collectionDone(this)};
}
@@ -882,7 +881,7 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) {
Scholar.Notifier.trigger("add", "item", this.newItems);
}
// notify collectionTreeView about updates
- if(this.newCollections.length) {
+ if(this.newCollections && this.newCollections.length) {
Scholar.Notifier.trigger("add", "collection", this.newCollections);
}
}
@@ -1007,7 +1006,7 @@ Scholar.Translate.prototype._itemTagsAndSeeAlso = function(item, newItem) {
/*
* executed when an item is done and ready to be loaded into the database
*/
-Scholar.Translate.prototype._itemDone = function(item) {
+Scholar.Translate.prototype._itemDone = function(item) {
if(!this.saveItem) { // if we're not supposed to save the item, just
// return the item array
@@ -1056,7 +1055,7 @@ Scholar.Translate.prototype._itemDone = function(item) {
item.itemType = item.complete = undefined;
// automatically set access date if URL is set
- if(item.url && !item.accessDate) {
+ if(item.url && !item.accessDate && this.type == "web") {
item.accessDate = (new Date()).toLocaleString();
}
@@ -1778,26 +1777,34 @@ Scholar.Translate.prototype._storageFunctions = function(read, write) {
* inside scraper code
*/
-Scholar.Translate.ScholarItem = function(itemType) {
- // assign item type
- this.itemType = itemType;
- // generate creators array
- this.creators = new Array();
- // generate notes array
- this.notes = new Array();
- // generate tags array
- this.tags = new Array();
- // generate see also array
- this.seeAlso = new Array();
- // generate file array
- this.attachments = new Array();
+Scholar.Translate.GenerateScholarItemClass = function() {
+	// names of the per-item list properties, in the order they are created
+	var listFields = ["creators", "notes", "tags", "seeAlso", "attachments"];
+	
+	/*
+	 * item constructor; a new one is built on every call to the generator,
+	 * so methods attached to its prototype are not shared between calls
+	 */
+	var ScholarItem = function(itemType) {
+		// assign item type
+		this.itemType = itemType;
+		// every item starts out with its own set of empty arrays
+		for(var i=0; i<listFields.length; i++) {
+			this[listFields[i]] = [];
+		}
+	};
+	
+	return ScholarItem;
}
/* Scholar.Translate.Collection: a class for generating a new top-level
* collection from inside scraper code
*/
-
-Scholar.Translate.ScholarCollection = function() {}
+
+Scholar.Translate.GenerateScholarCollectionClass = function() {
+	// build a new, empty collection constructor; it is also stored on
+	// Scholar.Translate.ScholarCollection, replacing the value left there by
+	// any earlier call
+	Scholar.Translate.ScholarCollection = function() {};
+	
+	// hand back the constructor that was just stored
+	return Scholar.Translate.ScholarCollection;
+}
/* Scholar.Translate.RDF: a class for handling RDF IO
*
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -62,7 +62,7 @@ Scholar.Utilities.prototype.cleanString = function(s) {
throw "cleanString: argument must be a string";
}
- s = s.replace(/[ \xA0\r\n]+/g, " ");
+ s = s.replace(/[\xA0\r\n\s]+/g, " ");
s = s.replace(/^\s+/, "");
return s.replace(/\s+$/, "");
}
@@ -236,13 +236,21 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
// Require link to match this
if(urlRe) {
- var urlRegexp = new RegExp();
- urlRegexp.compile(urlRe, "i");
+ if(urlRe.exec) {
+ var urlRegexp = urlRe;
+ } else {
+ var urlRegexp = new RegExp();
+ urlRegexp.compile(urlRe, "i");
+ }
}
// Do not allow text to match this
if(rejectRe) {
- var rejectRegexp = new RegExp();
- rejectRegexp.compile(rejectRe, "i");
+ if(rejectRe.exec) {
+ var rejectRegexp = rejectRe;
+ } else {
+ var rejectRegexp = new RegExp();
+ rejectRegexp.compile(rejectRe, "i");
+ }
}
if(!inHere.length) {
@@ -253,7 +261,7 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
var links = inHere[j].getElementsByTagName("a");
for(var i=0; i<links.length; i++) {
if(!urlRe || urlRegexp.test(links[i].href)) {
- var text = this.getNodeString(doc, links[i], './/text()', null);
+ var text = links[i].textContent;
if(text) {
text = this.cleanString(text);
if(!rejectRe || !rejectRegexp.test(text)) {
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,4 +1,4 @@
--- 84
+-- 85
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00'));
@@ -186,7 +186,15 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
title = title.substring(0, title.length-2);
}
newItem.title = Scholar.Utilities.capitalizeTitle(title);
- } else if(match[1] == ''Author(s)'') {
+ } else if(match[1] == "Series") {
+ newItem.series = match[2];
+ } else if(match[1] == "Description") {
+ var pageMatch = /([0-9]+) p\.?/
+ var m = pageMatch.exec(match[2]);
+ if(m) {
+ newItem.pages = m[1];
+ }
+ } else if(match[1] == ''Author(s)'' || match[1] == "Corp Author(s)") {
var yearRegexp = /[0-9]{4}-([0-9]{4})?/;
var authors = match[2].split('';'');
@@ -195,44 +203,33 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
for(var j=1; j<authors.length; j+=2) {
if(authors[j-1].substring(0, 1) != ''('' && !yearRegexp.test(authors[j])) {
// ignore places where there are parentheses
- newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true));
+ newItem.creators.push({lastName:authors[j], creatorType:"author", isInstitution:true});
}
}
} else {
newItem.creators.push(Scholar.Utilities.cleanString(match[2]));
}
} else if(match[1] == ''Publication'') {
- // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
match[2] = Scholar.Utilities.cleanString(match[2]);
if(match[2].substring(match[2].length-1) == '','') {
- match[2] = match[2].substring(0, match[2].length-1);
+ match[2] = match[2].substring(0, match[2].length-1);
+ }
+
+ // most, but not all, WorldCat publisher/places are
+ // colon delimited
+ var parts = match[2].split(/ ?: ?/);
+ if(parts.length == 2) {
+ newItem.place = parts[0];
+ newItem.publisher = parts[1];
+ } else {
+ newItem.publisher = match[2];
}
- newItem.publisher = match[2];
} else if(match[1] == ''Institution'') {
newItem.publisher = match[2];
} else if(match[1] == ''Standard No'') {
- var identifiers = match[2].split(/ +/);
- var j=0;
- while(j<(identifiers.length-1)) {
- var type = identifiers[j].substring(0, identifiers[j].length-1);
- var lastChar;
- var value;
-
- j++;
- while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') {
- if(identifiers[j].substring(0, 1) != ''('') {
- if(lastChar == '';'') {
- value = identifiers[j].substring(0, identifiers[j].length-1);
- } else {
- value = identifiers[j];
- }
- if(type == "ISBN" || type == "ISSN") {
- newItem[type] = value;
- }
- }
- j++;
- }
- }
+ var ISBNRe = /ISBN:\s*([0-9X]+)/
+ var m = ISBNRe.exec(match[2]);
+ if(m) newItem.ISBN = m[1];
} else if(match[1] == ''Year'') {
newItem.date = match[2];
} else if(match[1] == "Descriptor") {
@@ -255,7 +252,9 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006
if(match[2].substr(0, 8) != "WorldCat") {
newItem.itemType = "journalArticle";
}
- } else {
+ } else if(match[1] != "Availability" &&
+ match[1] != "Find Items About" &&
+ match[1] != "Document Type") {
newItem.extra += match[1]+": "+match[2]+"\n";
}
} else {
@@ -3635,11 +3634,6 @@ function doWeb(doc, url) {
if(articleRegexp.test(url)) {
scrape(doc);
} else {
- var namespace = doc.documentElement.namespaceURI;
- var nsResolver = namespace ? function(prefix) {
- if (prefix == ''x'') return namespace; else return null;
- } : null;
-
var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://chronicle\\.com/(?:daily|weekly)/[^/]+/'');
items = Scholar.selectItems(items);
@@ -3735,11 +3729,6 @@ function doWeb(doc, url) {
if(articleRegexp.test(url)) {
scrape(doc);
} else {
- var namespace = doc.documentElement.namespaceURI;
- var nsResolver = namespace ? function(prefix) {
- if (prefix == ''x'') return namespace; else return null;
- } : null;
-
var items = Scholar.Utilities.getItemArray(doc, doc, "^http://www\\.nybooks\\.com/articles/[0-9]+/");
items = Scholar.selectItems(items);
@@ -3757,6 +3746,113 @@ function doWeb(doc, url) {
}
}');
+REPLACE INTO "translators" VALUES ('d1bf1c29-4432-4ada-8893-2e29fc88fd9e', '2006-09-06 23:27:00', 4, 'Washington Post', 'Simon Kornblith', '^http://www\.washingtonpost\.com/',
+'function detectWeb(doc, url) {
+	// Returns "newspaperArticle" for an article URL, "multiple" for a page
+	// that links to at least one article, and nothing otherwise.
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = null;
+	if(namespace) {
+		nsResolver = function(prefix) {
+			return (prefix == ''x'') ? namespace : null;
+		};
+	}
+	
+	// only offer to scrape when a sign-out link is present, i.e. the user
+	// is logged in
+	var signOutLink = doc.evaluate(''//a[text() = "Sign out" or text() = "Sign Out"]'',
+	                               doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+	if(!signOutLink) {
+		return;
+	}
+	
+	var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/;
+	
+	// the page itself is an article
+	if(articleRegexp.test(url)) {
+		return "newspaperArticle";
+	}
+	
+	// otherwise, one link to an article is enough to offer a selection
+	var links = doc.getElementsByTagName("a");
+	for(var idx=0; idx<links.length; idx++) {
+		if(articleRegexp.test(links[idx].href)) {
+			return "multiple";
+		}
+	}
+}',
+'function scrape(doc) {
+	// Builds a newspaperArticle item from a single article page (doc) and
+	// completes it. Title, authors, date, page and tags are all read from
+	// the document; the page itself is attached as an HTML snapshot.
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+	
+	var newItem = new Scholar.Item("newspaperArticle");
+	newItem.publicationTitle = "The Washington Post";
+	newItem.ISSN = "0740-5421";
+	
+	newItem.url = doc.location.href;
+	
+	newItem.attachments.push({document:doc, title:"Article (HTML)",
+	                          downloadable:true});
+	
+	// grab title from doc title
+	newItem.title = doc.title;
+	
+	var byline = doc.evaluate(''//div[@id="byline"]'', doc, nsResolver,
+	                          XPathResult.ANY_TYPE, null).iterateNext();
+	// grab authors from byline
+	if(byline) {
+		// substr(3) drops the first three characters of the byline text
+		// (presumably a leading "By " -- TODO confirm against live pages)
+		var authors = byline.textContent.substr(3).split(" and ");
+		for each(var author in authors) {
+			newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
+		}
+	}
+	
+	// group 1: everything before the first semicolon (used as the date)
+	// group 2: the whole optional page suffix, leading whitespace included
+	// group 3: just the page designation inside that suffix
+	var pageRe = /([^;]+);([\xA0 ]+Pages?[\xA0 ]+([A-Z0-9\-]+))?/;
+	
+	var fonts = doc.evaluate(''//div[@id="article"]/p/font/text()'', doc, nsResolver,
+	                         XPathResult.ANY_TYPE, null);
+	var font;
+	while(font = fonts.iterateNext()) {
+		// grab pages and date
+		Scholar.Utilities.debug(Scholar.Utilities.cleanString(font.nodeValue));
+		var m = pageRe.exec(font.nodeValue);
+		if(m) {
+			newItem.date = m[1];
+			// store the page designation only (group 3), not the whole
+			// suffix with its whitespace and "Page" label (group 2)
+			newItem.pages = m[3];
+			break;
+		}
+	}
+	
+	// grab tags from meta tag
+	var keywords = doc.getElementsByTagName("meta");
+	if(keywords) {
+		keywords = keywords.namedItem("keywords");
+		if(keywords) {
+			keywords = keywords.getAttribute("content");
+			if(keywords) {
+				newItem.tags = keywords.split(/, ?/);
+			}
+		}
+	}
+	
+	newItem.complete();
+}
+
+function doWeb(doc, url) {
+	// Scrapes the current page when it is an article; otherwise lets the
+	// user pick from the article links on the page and scrapes each pick.
+	var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/;
+	
+	// single article: scrape it directly
+	if(articleRegexp.test(url)) {
+		scrape(doc);
+		return;
+	}
+	
+	// collect the links matching the article pattern and ask the user
+	// which ones to save
+	var candidates = Scholar.Utilities.getItemArray(doc, doc, articleRegexp);
+	var chosen = Scholar.selectItems(candidates);
+	if(!chosen) {
+		return true;
+	}
+	
+	// the keys of the selection are the URLs to process
+	var urls = [];
+	for(var itemURL in chosen) {
+		urls.push(itemURL);
+	}
+	
+	Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); });
+	Scholar.wait();
+}');
+
REPLACE INTO "translators" VALUES ('a07bb62a-4d2d-4d43-ba08-d9679a0122f8', '2006-08-26 16:14:00', 4, 'ABC-CLIO', 'Simon Kornblith', '^http://serials\.abc-clio\.com/active/go/ABC-Clio-Serials_v4.1$',
'function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI;