www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 7b7d3d85e38aa77c4c47993ea03a451770afedda
parent b8ddba3a67383eeddf55f12a47186db102b43a69
Author: Simon Kornblith <simon@simonster.com>
Date:   Fri,  8 Sep 2006 05:47:47 +0000

- added Washington Post translator
- translation works properly even when a user has switched to a different page


Diffstat:
M chrome/chromeFiles/content/scholar/ingester/browser.js | 2 --
M chrome/chromeFiles/content/scholar/xpcom/translate.js | 51 +++++++++++++++++++++++++++++----------------------
M chrome/chromeFiles/content/scholar/xpcom/utilities.js | 20 ++++++++++++++------
M scrapers.sql | 174 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------
4 files changed, 178 insertions(+), 69 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -167,8 +167,6 @@ Scholar_Ingester_Interface.tabClose = function(event) { Scholar_Ingester_Interface.tabSelect = function(event) { var data = Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); Scholar_Ingester_Interface._updateStatus(data); - // Make sure scrape progress is gone - Scholar_Ingester_Interface.Progress.kill(); } Scholar_Ingester_Interface.hidePopup = function(collectionID) { diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -558,14 +558,13 @@ Scholar.Translate.prototype._generateSandbox = function() { this._sandbox.Scholar.nextItem = function() { return me._exportGetItem() }; this._sandbox.Scholar.nextCollection = function() { return me._exportGetCollection() } } else { - // add routines to add new items - this._sandbox.Scholar.Item = Scholar.Translate.ScholarItem; - // attach the function to be run when an item is done + // copy routines to add new items + this._sandbox.Scholar.Item = Scholar.Translate.GenerateScholarItemClass(); this._sandbox.Scholar.Item.prototype.complete = function() {me._itemDone(this)}; if(this.type == "import") { // add routines to add new collections - this._sandbox.Scholar.Collection = Scholar.Translate.ScholarCollection; + this._sandbox.Scholar.Collection = Scholar.Translate.GenerateScholarItemClass(); // attach the function to be run when a collection is done this._sandbox.Scholar.Collection.prototype.complete = function() {me._collectionDone(this)}; } @@ -882,7 +881,7 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) { Scholar.Notifier.trigger("add", "item", this.newItems); } // notify collectionTreeView about updates - if(this.newCollections.length) { + if(this.newCollections && this.newCollections.length) { Scholar.Notifier.trigger("add", "collection", 
this.newCollections); } } @@ -1007,7 +1006,7 @@ Scholar.Translate.prototype._itemTagsAndSeeAlso = function(item, newItem) { /* * executed when an item is done and ready to be loaded into the database */ -Scholar.Translate.prototype._itemDone = function(item) { +Scholar.Translate.prototype._itemDone = function(item) { if(!this.saveItem) { // if we're not supposed to save the item, just // return the item array @@ -1056,7 +1055,7 @@ Scholar.Translate.prototype._itemDone = function(item) { item.itemType = item.complete = undefined; // automatically set access date if URL is set - if(item.url && !item.accessDate) { + if(item.url && !item.accessDate && this.type == "web") { item.accessDate = (new Date()).toLocaleString(); } @@ -1778,26 +1777,34 @@ Scholar.Translate.prototype._storageFunctions = function(read, write) { * inside scraper code */ -Scholar.Translate.ScholarItem = function(itemType) { - // assign item type - this.itemType = itemType; - // generate creators array - this.creators = new Array(); - // generate notes array - this.notes = new Array(); - // generate tags array - this.tags = new Array(); - // generate see also array - this.seeAlso = new Array(); - // generate file array - this.attachments = new Array(); +Scholar.Translate.GenerateScholarItemClass = function() { + var ScholarItem = function(itemType) { + // assign item type + this.itemType = itemType; + // generate creators array + this.creators = new Array(); + // generate notes array + this.notes = new Array(); + // generate tags array + this.tags = new Array(); + // generate see also array + this.seeAlso = new Array(); + // generate file array + this.attachments = new Array(); + }; + + return ScholarItem; } /* Scholar.Translate.Collection: a class for generating a new top-level * collection from inside scraper code */ - -Scholar.Translate.ScholarCollection = function() {} + +Scholar.Translate.GenerateScholarCollectionClass = function() { + var ScholarCollection = Scholar.Translate.ScholarCollection 
= function() {}; + + return ScholarCollection; +} /* Scholar.Translate.RDF: a class for handling RDF IO * diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -62,7 +62,7 @@ Scholar.Utilities.prototype.cleanString = function(s) { throw "cleanString: argument must be a string"; } - s = s.replace(/[ \xA0\r\n]+/g, " "); + s = s.replace(/[\xA0\r\n\s]+/g, " "); s = s.replace(/^\s+/, ""); return s.replace(/\s+$/, ""); } @@ -236,13 +236,21 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, // Require link to match this if(urlRe) { - var urlRegexp = new RegExp(); - urlRegexp.compile(urlRe, "i"); + if(urlRe.exec) { + var urlRegexp = urlRe; + } else { + var urlRegexp = new RegExp(); + urlRegexp.compile(urlRe, "i"); + } } // Do not allow text to match this if(rejectRe) { - var rejectRegexp = new RegExp(); - rejectRegexp.compile(rejectRe, "i"); + if(rejectRe.exec) { + var rejectRegexp = rejectRe; + } else { + var rejectRegexp = new RegExp(); + rejectRegexp.compile(rejectRe, "i"); + } } if(!inHere.length) { @@ -253,7 +261,7 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, var links = inHere[j].getElementsByTagName("a"); for(var i=0; i<links.length; i++) { if(!urlRe || urlRegexp.test(links[i].href)) { - var text = this.getNodeString(doc, links[i], './/text()', null); + var text = links[i].textContent; if(text) { text = this.cleanString(text); if(!rejectRe || !rejectRegexp.test(text)) { diff --git a/scrapers.sql b/scrapers.sql @@ -1,4 +1,4 @@ --- 84 +-- 85 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00')); @@ -186,7 +186,15 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 title = title.substring(0, title.length-2); } newItem.title = Scholar.Utilities.capitalizeTitle(title); - } else if(match[1] == 
''Author(s)'') { + } else if(match[1] == "Series") { + newItem.series = match[2]; + } else if(match[1] == "Description") { + var pageMatch = /([0-9]+) p\.?/ + var m = pageMatch.exec(match[2]); + if(m) { + newItem.pages = m[1]; + } + } else if(match[1] == ''Author(s)'' || match[1] == "Corp Author(s)") { var yearRegexp = /[0-9]{4}-([0-9]{4})?/; var authors = match[2].split('';''); @@ -195,44 +203,33 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 for(var j=1; j<authors.length; j+=2) { if(authors[j-1].substring(0, 1) != ''('' && !yearRegexp.test(authors[j])) { // ignore places where there are parentheses - newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true)); + newItem.creators.push({lastName:authors[j], creatorType:"author", isInstitution:true}); } } } else { newItem.creators.push(Scholar.Utilities.cleanString(match[2])); } } else if(match[1] == ''Publication'') { - // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it. 
match[2] = Scholar.Utilities.cleanString(match[2]); if(match[2].substring(match[2].length-1) == '','') { - match[2] = match[2].substring(0, match[2].length-1); + match[2] = match[2].substring(0, match[2].length-1); + } + + // most, but not all, WorldCat publisher/places are + // colon delimited + var parts = match[2].split(/ ?: ?/); + if(parts.length == 2) { + newItem.place = parts[0]; + newItem.publisher = parts[1]; + } else { + newItem.publisher = match[2]; } - newItem.publisher = match[2]; } else if(match[1] == ''Institution'') { newItem.publisher = match[2]; } else if(match[1] == ''Standard No'') { - var identifiers = match[2].split(/ +/); - var j=0; - while(j<(identifiers.length-1)) { - var type = identifiers[j].substring(0, identifiers[j].length-1); - var lastChar; - var value; - - j++; - while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') { - if(identifiers[j].substring(0, 1) != ''('') { - if(lastChar == '';'') { - value = identifiers[j].substring(0, identifiers[j].length-1); - } else { - value = identifiers[j]; - } - if(type == "ISBN" || type == "ISSN") { - newItem[type] = value; - } - } - j++; - } - } + var ISBNRe = /ISBN:\s*([0-9X]+)/ + var m = ISBNRe.exec(match[2]); + if(m) newItem.ISBN = m[1]; } else if(match[1] == ''Year'') { newItem.date = match[2]; } else if(match[1] == "Descriptor") { @@ -255,7 +252,9 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 if(match[2].substr(0, 8) != "WorldCat") { newItem.itemType = "journalArticle"; } - } else { + } else if(match[1] != "Availability" && + match[1] != "Find Items About" && + match[1] != "Document Type") { newItem.extra += match[1]+": "+match[2]+"\n"; } } else { @@ -3635,11 +3634,6 @@ function doWeb(doc, url) { if(articleRegexp.test(url)) { scrape(doc); } else { - var namespace = doc.documentElement.namespaceURI; - var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://chronicle\\.com/(?:daily|weekly)/[^/]+/''); items = Scholar.selectItems(items); @@ -3735,11 +3729,6 @@ function doWeb(doc, url) { if(articleRegexp.test(url)) { scrape(doc); } else { - var namespace = doc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - var items = Scholar.Utilities.getItemArray(doc, doc, "^http://www\\.nybooks\\.com/articles/[0-9]+/"); items = Scholar.selectItems(items); @@ -3757,6 +3746,113 @@ function doWeb(doc, url) { } }'); +REPLACE INTO "translators" VALUES ('d1bf1c29-4432-4ada-8893-2e29fc88fd9e', '2006-09-06 23:27:00', 4, 'Washington Post', 'Simon Kornblith', '^http://www\.washingtonpost\.com/', +'function detectWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + // don''t say we can scrape when we can''t; make sure user is logged in + var signedIn = doc.evaluate(''//a[text() = "Sign out" or text() = "Sign Out"]'', + doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(!signedIn) { + return; + } + + var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/ + if(articleRegexp.test(url)) { + return "newspaperArticle"; + } else { + var aTags = doc.getElementsByTagName("a"); + for(var i=0; i<aTags.length; i++) { + if(articleRegexp.test(aTags[i].href)) { + return "multiple"; + } + } + } +}', +'function scrape(doc) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var newItem = new Scholar.Item("newspaperArticle"); + newItem.publicationTitle = "The Washington Post"; + newItem.ISSN = "0740-5421"; + + newItem.url = doc.location.href; + var metaTags = doc.getElementsByTagName("meta"); + + newItem.attachments.push({document:doc, title:"Article (HTML)", + downloadable:true}); + + // grab title from doc title + newItem.title = doc.title; + + var byline = doc.evaluate(''//div[@id="byline"]'', doc, nsResolver, + XPathResult.ANY_TYPE, null).iterateNext(); + // grab authors from byline + if(byline) { + var authors = byline.textContent.substr(3).split(" and "); + for each(var author in authors) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author")); + } + } + + var fonts = doc.evaluate(''//div[@id="article"]/p/font/text()'', doc, nsResolver, + XPathResult.ANY_TYPE, null); + var font; + while(font = fonts.iterateNext()) { + var pageRe = /([^;]+);([\xA0 ]+Pages?[\xA0 ]+([A-Z0-9\-]+))?/ + // grab pages and date + Scholar.Utilities.debug(Scholar.Utilities.cleanString(font.nodeValue)); + var m = pageRe.exec(font.nodeValue); + if(m) { + newItem.date = m[1]; + newItem.pages = m[2]; + break; + } + } + + // grab tags from meta tag + var keywords = doc.getElementsByTagName("meta"); + if(keywords) { + keywords = keywords.namedItem("keywords"); + if(keywords) { + keywords = keywords.getAttribute("content"); + if(keywords) { + newItem.tags = keywords.split(/, ?/); + } + } + } + + newItem.complete(); +} + +function doWeb(doc, url) { + var articleRegexp = /http:\/\/www\.washingtonpost\.com\/wp-dyn\/content\/article\/[0-9]+\/[0-9]+\/[0-9]+\/[^\/]+\.html/ + if(articleRegexp.test(url)) { + scrape(doc); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, articleRegexp); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var urls = new Array(); + for(var i in items) { + urls.push(i); + } + + 
Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); }); + Scholar.wait(); + } +}'); + REPLACE INTO "translators" VALUES ('a07bb62a-4d2d-4d43-ba08-d9679a0122f8', '2006-08-26 16:14:00', 4, 'ABC-CLIO', 'Simon Kornblith', '^http://serials\.abc-clio\.com/active/go/ABC-Clio-Serials_v4.1$', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI;