www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit cf8dc232b13ebad70b0e4118cd1a896d010cfa48
parent 451be4b3a3b09d5b1c9d2e65e6a505c3c289280a
Author: Simon Kornblith <simon@simonster.com>
Date:   Thu,  7 Sep 2006 01:23:13 +0000

- new translators: New York Review of Books, Chronicle of Higher Education
- more useful errors in utilities
- fixes minor bugs in citation styling


Diffstat:
Mchrome/chromeFiles/content/scholar/xpcom/cite.js | 10++++------
Mchrome/chromeFiles/content/scholar/xpcom/utilities.js | 20++++++++++++++++++++
Mscrapers.sql | 222+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
3 files changed, 242 insertions(+), 10 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/xpcom/cite.js b/chrome/chromeFiles/content/scholar/xpcom/cite.js @@ -104,7 +104,6 @@ CSL.prototype.preprocessItems = function(items) { if(!item._csl || item._csl.dateModified != dateModified) { // namespace everything in item._csl so there's no chance of overlap item._csl = new Object(); - item._csl.ignore = new Array(); item._csl.dateModified = dateModified; // separate item into authors, editors, translators @@ -115,11 +114,10 @@ CSL.prototype.preprocessItems = function(items) { // parse date item._csl.date = CSL.prototype._processDate(item.getField("date")); - } else { - // clear disambiguation and subsequent author substitute - if(item._csl.disambiguation) item._csl.date.disambiguation = undefined; - if(item._csl.subsequentAuthorSubstitute) item._csl.subsequentAuthorSubstitute = undefined; } + // clear disambiguation and subsequent author substitute + if(item._csl.disambiguation) item._csl.date.disambiguation = undefined; + if(item._csl.subsequentAuthorSubstitute) item._csl.subsequentAuthorSubstitute = undefined; } // sort by sort order @@ -179,7 +177,7 @@ CSL.prototype.preprocessItems = function(items) { item._csl.number = i; // handle subsequent author substitutes - if(this._bib.subsequentAuthorSubstitute && lastAuthor == author) { + if(lastAuthor == author) { item._csl.subsequentAuthorSubstitute = true; } lastAuthor = author; diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -26,6 +26,10 @@ Scholar.Utilities.prototype.strToDate = function(date) { * Cleans extraneous punctuation off an author name */ Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) { + if(typeof(author) != "string") { + throw "cleanAuthor: author must be a string"; + } + author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); author = author.replace(/ +/, ' '); @@ -54,6 +58,10 @@ 
Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) { * Cleans whitespace off a string and replaces multiple spaces with one */ Scholar.Utilities.prototype.cleanString = function(s) { + if(typeof(s) != "string") { + throw "cleanString: argument must be a string"; + } + s = s.replace(/[ \xA0\r\n]+/g, " "); s = s.replace(/^\s+/, ""); return s.replace(/\s+$/, ""); @@ -63,6 +71,10 @@ Scholar.Utilities.prototype.cleanString = function(s) { * Cleans any non-word non-parenthesis characters off the ends of a string */ Scholar.Utilities.prototype.superCleanString = function(x) { + if(typeof(x) != "string") { + throw "superCleanString: argument must be a string"; + } + var x = x.replace(/^[^\w(]+/, ""); return x.replace(/[^\w)]+$/, ""); } @@ -71,6 +83,10 @@ Scholar.Utilities.prototype.superCleanString = function(x) { * Eliminates HTML tags, replacing <br>s with /ns */ Scholar.Utilities.prototype.cleanTags = function(x) { + if(typeof(x) != "string") { + throw "cleanTags: argument must be a string"; + } + x = x.replace(/<br[^>]*>/gi, "\n"); return x.replace(/<[^>]+>/g, ""); } @@ -118,6 +134,10 @@ Scholar.Utilities.prototype.inArray = Scholar.inArray; * pads a number or other string with a given string on the left */ Scholar.Utilities.prototype.lpad = function(string, pad, length) { + if(typeof(string) != "string") { + string = String(string); + } + while(string.length < length) { string = pad + string; } diff --git a/scrapers.sql b/scrapers.sql @@ -1,4 +1,4 @@ --- 81 +-- 82 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00')); @@ -3458,7 +3458,7 @@ function scrape(doc, url) { return; } - newItem.attachments.push({url:url, title:"New York Times Article", + newItem.attachments.push({url:url, title:"Article (HTML)", mimeType:"text/html", downloadable:true}); } else { newItem.url = doc.location.href; @@ -3471,7 +3471,7 @@ function scrape(doc, url) 
{ } } - newItem.attachments.push({document:doc, title:"New York Times Article", + newItem.attachments.push({document:doc, title:"Article (HTML)", downloadable:true}); } @@ -3543,6 +3543,220 @@ function doWeb(doc, url) { } }'); +REPLACE INTO "translators" VALUES ('1e6d1529-246f-4429-84e2-1f1b180b250d', '2006-09-06 17:54:00', 4, 'Chronicle of Higher Education', 'Simon Kornblith', '^http://chronicle\.com/', +'function detectWeb(doc, url) { + var articleRegexp = /^http:\/\/chronicle\.com\/(?:daily|weekly)\/[^/]+\// + if(articleRegexp.test(url)) { + if(doc.location.href.indexOf("weekly") != -1) { + return "magazineArticle"; + } else { + return "website"; + } + } else { + var aTags = doc.getElementsByTagName("a"); + for(var i=0; i<aTags.length; i++) { + if(articleRegexp.test(aTags[i].href)) { + return "multiple"; + } + } + } +}', +'function associateMeta(newItem, metaTags, field, scholarField) { + if(metaTags.namedItem(field)) { + newItem[scholarField] = Scholar.Utilities.cleanString(metaTags.namedItem(field).getAttribute("content")); + } +} + +function scrape(doc) { + if(doc.location.href.indexOf("weekly") != -1) { + var newItem = new Scholar.Item("magazineArticle"); + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + // go in search of pages + var content = doc.evaluate(''/html/body/table[@class="layout"]/tbody/tr[1]/td[@class="content"]'', + doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if(content) { + var pagesRegexp = /http:\/\/chronicle.com\nSection: [^\n]+\nVolume [0-9]+, Issue [0-9]+, Pages? 
([A-Z0-9\-]+)/; + var m = pagesRegexp.exec(content.textContent); + if(m) { + newItem.pages = m[1]; + } + } + } else { + var newItem = new Scholar.Item("website"); + } + newItem.publicationTitle = "The Chronicle of Higher Education"; + newItem.ISSN = "0009-5982"; + + newItem.url = doc.location.href; + var metaTags = doc.getElementsByTagName("meta"); + + newItem.attachments.push({document:doc, title:"Article (HTML)", + downloadable:true}); + + associateMeta(newItem, metaTags, "published_date", "date"); + associateMeta(newItem, metaTags, "headline", "title"); + associateMeta(newItem, metaTags, "section", "section"); + associateMeta(newItem, metaTags, "volume", "volume"); + associateMeta(newItem, metaTags, "issue", "issue"); + + if(metaTags.namedItem("byline")) { + var author = Scholar.Utilities.cleanString(metaTags.namedItem("byline").getAttribute("content")); + if(author.substr(0, 3).toLowerCase() == "by ") { + author = author.substr(3); + } + + var authors = author.split(" and "); + for each(var author in authors) { + // fix capitalization + var words = author.split(" "); + for(var i in words) { + words[i] = words[i][0].toUpperCase()+words[i].substr(1).toLowerCase(); + } + author = words.join(" "); + + if(words[0] == "The") { + newItem.creators.push({lastName:author, creatorType:"author"}); + } else { + newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author")); + } + } + } + + newItem.complete(); +} + +function doWeb(doc, url) { + var articleRegexp = /^http:\/\/chronicle\.com\/(?:daily|weekly)\/[^/]+\//; + if(articleRegexp.test(url)) { + scrape(doc); + } else { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://chronicle\\.com/(?:daily|weekly)/[^/]+/''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var urls = new Array(); + for(var i in items) { + urls.push(i); + } + + Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); }); + Scholar.wait(); + } +}'); + +REPLACE INTO "translators" VALUES ('4c164cc8-be7b-4d02-bfbf-37a5622dfd56', '2006-09-06 18:54:00', 4, 'New York Review of Books', 'Simon Kornblith', '^http://www\.nybooks\.com/', +'function detectWeb(doc, url) { + var articleRegexp = /^http:\/\/www\.nybooks\.com\/articles\/[0-9]+/ + if(articleRegexp.test(url)) { + return "journalArticle"; + } else { + var aTags = doc.getElementsByTagName("a"); + for(var i=0; i<aTags.length; i++) { + if(articleRegexp.test(aTags[i].href)) { + return "multiple"; + } + } + } +}', +'function associateMeta(newItem, metaTags, field, scholarField) { + if(metaTags.namedItem(field)) { + newItem[scholarField] = Scholar.Utilities.cleanString(metaTags.namedItem(field).getAttribute("content")); + } +} + +function scrape(doc) { + var newItem = new Scholar.Item("journalArticle"); + newItem.publicationTitle = "The New York Review of Books"; + newItem.ISSN = "0028-7504"; + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + newItem.url = doc.location.href; + var metaTags = doc.getElementsByTagName("meta"); + + newItem.attachments.push({document:doc, title:"Review (HTML)", + downloadable:true}); + + associateMeta(newItem, metaTags, "dc.title", "title"); + + var info = doc.evaluate(''//div[@id="center-content"]/h4[@class="date"]'', + doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + + if(info) { + // get date (which is in an a tag) + newItem.date = doc.evaluate("./a", info, nsResolver, XPathResult.ANY_TYPE, + null).iterateNext(); + if(newItem.date) { + newItem.date = newItem.date.textContent; + } + + info = Scholar.Utilities.cleanString(info.textContent); + + // get volume and issue + var infoRe = /Volume ([0-9]+), Number ([0-9]+)/; + var m = infoRe.exec(info); + if(m) { + newItem.volume = m[1]; + newItem.issue = m[2]; + } + } + + + var authors = doc.evaluate(''//div[@id="center-content"]/h4/a[substring(@href, 1, 9) = "/authors/"]'', + doc, nsResolver, XPathResult.ANY_TYPE, null); + + + var author; + while(author = authors.iterateNext()) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(author.textContent, "author", false)); + } + + newItem.complete(); +} + +function doWeb(doc, url) { + var articleRegexp = /^http:\/\/www\.nybooks\.com\/articles\/[0-9]+/ + if(articleRegexp.test(url)) { + scrape(doc); + } else { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var items = Scholar.Utilities.getItemArray(doc, doc, "^http://www\\.nybooks\\.com/articles/[0-9]+/"); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var urls = new Array(); + for(var i in items) { + urls.push(i); + } + + Scholar.Utilities.processDocuments(urls, scrape, function() { Scholar.done(); }); + Scholar.wait(); + } +}'); + REPLACE INTO "translators" VALUES ('a07bb62a-4d2d-4d43-ba08-d9679a0122f8', '2006-08-26 16:14:00', 4, 'ABC-CLIO', 'Simon Kornblith', '^http://serials\.abc-clio\.com/active/go/ABC-Clio-Serials_v4.1$', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; @@ -5401,7 +5615,7 @@ var inputTypeMap = { INPR:"manuscript", JFULL:"journalArticle", MAP:"artwork", - PAMP:"book", + PAMP:"manuscript", RPRT:"book", SER:"book", SLIDE:"artwork",