www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 6626eba844e847a2490b7508973d1a3abbf68e4e
parent e3d062a81968ed828d71bf451c1c311ce4742ff2
Author: Simon Kornblith <simon@simonster.com>
Date:   Mon,  7 Aug 2006 05:15:30 +0000

addresses #83, figure out how to implement OpenURL

OpenURL lookup now works for books. This means that all that's necessary to add scrapable book metadata to a page is an ISBN, as shown below:

<span class="Z3988" title="ctx_ver=Z39.88-2004&amp;rft_val_fmt=info:ofi/fmt:kev:mtx:book&amp;rft.isbn=1579550088"></span>

Also, we can now scrape Open WorldCat and Wikipedia Book Sources pages with no specialized code involved.

I'm still looking for a better way of looking up journal article metadata. It's currently implemented with CrossRef, but CrossRef simply will not work without a DOI, and is also incomplete (it only holds the last name of the first author).


Diffstat:
Mchrome/chromeFiles/content/scholar/xpcom/ingester.js | 296++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mchrome/chromeFiles/content/scholar/xpcom/utilities.js | 8++++++++
Mscrapers.sql | 216+++++++++++++++++++++++++++++++------------------------------------------------
3 files changed, 388 insertions(+), 132 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -147,6 +147,8 @@ Scholar.OpenURL = new function() { this.resolve = resolve; this.discoverResolvers = discoverResolvers; this.createContextObject = createContextObject; + this.parseContextObject = parseContextObject; + this.lookupContextObject = lookupContextObject; /* * Returns a URL to look up an item in the OpenURL resolver @@ -224,7 +226,7 @@ Scholar.OpenURL = new function() { co += "&id="+escape(identifier); } } else { - var co = "ctx_ver=Z39.88-2004"; + var co = "url_ver=Z39.88-2004&ctx_ver=Z39.88-2004"; for each(identifier in identifiers) { co += "&rft_id="+escape(identifier); @@ -300,6 +302,298 @@ Scholar.OpenURL = new function() { return co; } + /* + * Generates an item in the format returned by item.fromArray() given an + * OpenURL version 1.0 contextObject + */ + function parseContextObject(co) { + var coParts = co.split("&"); + + var item = new Array(); + item.creators = new Array(); + + // get type + item.itemType = _determineResourceType(coParts); + if(!item.itemType) { + return false; + } + + var pagesKey = ""; + + for each(part in coParts) { + var keyVal = part.split("="); + var key = keyVal[0]; + var value = unescape(keyVal[1].replace(/\+|%2[bB]/g, " ")); + if(!value) { + continue; + } + + if(key == "rft_id") { + var firstEight = value.substr(0, 8).toLowerCase(); + if(firstEight == "info:doi") { + item.DOI = value; + } else if(firstEight == "urn:isbn") { + item.ISBN = value.substr(9); + } + } else if(key == "rft.btitle") { + if(item.itemType == "book") { + item.title = value; + } else if(item.itemType == "bookSection") { + item.publicationTitle = value; + } + } else if(key == "rft.atitle" && item.itemType != "book") { + item.title = value; + } else if(key == "rft.jtitle" && item.itemType == "journal") { + item.publcation = value; + } else if(key == "rft.stitle" && item.itemType == "journal") { + item.journalAbbreviation = 
value; + } else if(key == "rft.date") { + item.date = value; + } else if(key == "rft.volume") { + item.volume = value; + } else if(key == "rft.issue") { + item.issue = value; + } else if(key == "rft.pages") { + pagesKey = key; + item.pages = value; + } else if(key == "rft.spage") { + if(pagesKey != "rft.pages") { + pagesKey = key; + // make pages look like start-end + if(pagesKey == "rft.epage") { + if(value != item.pages) { + item.pages = value+"-"+item.pages; + } + } else { + item.pages = value; + } + } + } else if(key == "rft.epage") { + if(pagesKey != "rft.pages") { + pagesKey = key; + // make pages look like start-end + if(pagesKey == "rft.spage") { + if(value != item.pages) { + item.pages = +item.pages+"-"+value; + } + } else { + item.pages = value; + } + } + } else if(key == "issn" || (key == "eissn" && !item.ISSN)) { + item.ISSN = value; + } else if(key == "rft.aulast") { + var lastCreator = item.creators[item.creators.length-1]; + if(item.creators.length && !lastCreator.lastName && !lastCreator.institutional) { + lastCreator.lastName = value; + } else { + item.creators.push({lastName:value}); + } + } else if(key == "rft.aufirst") { + var lastCreator = item.creators[item.creators.length-1]; + if(item.creators.length && !lastCreator.firstName && !lastCreator.institutional) { + lastCreator.firstName = value; + } else { + item.creators.push({firstName:value}); + } + } else if(key == "rft.au") { + item.creators.push(Scholar.cleanAuthor(value, "author", true)); + } else if(key == "rft.aucorp") { + item.creators.push({lastName:value, institutional:true}); + } else if(key == "rft.isbn" && !item.ISBN) { + item.ISBN = value; + } else if(key == "rft.pub") { + item.publisher = value; + } else if(key == "rft.place") { + item.place = value; + } else if(key == "rft.edition") { + item.edition = value; + } else if(key == "rft.series") { + item.seriesTitle = value; + } + } + + return item; + } + + /* + * Looks up additional information on an item in the format returned by + 
* item.fromArray() in CrossRef or Open WorldCat given an OpenURL version + * 1.0 contextObject + */ + function lookupContextObject(co, done, error) { + // CrossRef requires a url_ver to work right + if(co.indexOf("url_ver=Z39.88-2004") == -1) { + co = "url_ver=Z39.88-2004&"+co; + } + + var type = _determineResourceType(co.split("&")); + if(!type) { + return false; + } + + if(type == "journal") { + // look up journals in CrossRef + Scholar.Utilities.HTTP.doGet("http://www.crossref.org/openurl/?"+co+"&noredirect=true", null, function(req) { + var items = _processCrossRef(req.responseText); + done(items); + }); + } else { + // look up books in Open WorldCat + Scholar.Utilities.HTTP.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) { + var doc = browser.contentDocument; + // find new COinS in the Open WorldCat page + items = _processOWC(doc); + + if(items) { // we got a single item page; return the item + done(items); + } else { // assume we have a search results page + var items = new Array(); + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + + // first try to get only books + var elmts = doc.evaluate('//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); + var elmt = elmts.iterateNext(); + if(!elmt) { // if that fails, look for other options + var elmts = doc.evaluate('//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); + elmt = elmts.iterateNext() + } + + var urlsToProcess = new Array(); + do { + urlsToProcess.push(elmt.href); + } while(elmt = elmts.iterateNext()); + + Scholar.Utilities.HTTP.processDocuments(null, urlsToProcess, function(browser) { + // per URL + var newItems = _processOWC(browser.contentDocument); + if(newItems) { + items = items.concat(newItems); + } + }, function() { // done + done(items); + }, function() { // error + error(); + }); + } + }, null, function() { + error(); + }); + } + } + + /* + * Processes the XML format returned by CrossRef + */ + function _processCrossRef(xmlOutput) { + xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, ""); + + // parse XML with E4X + var qr = new Namespace("http://www.crossref.org/qrschema/2.0"); + try { + var xml = new XML(xmlOutput); + } catch(e) { + return false; + } + + // ensure status is valid + var status = xml.qr::body.qr::query.@status.toString(); + if(status != "resolved" && status != "multiresolved") { + return false; + } + + var query = xml.qr::body.qr::query; + var item = new Array(); + item.creators = new Array(); + + // try to get a DOI + item.DOI = query.qr::doi.(@type=="journal_article").toString(); + if(!item.DOI) { + item.DOI = query.qr::doi.(@type=="book_title").toString(); + } + if(!item.DOI) { + item.DOI = 
query.qr::doi.(@type=="book_content").toString(); + } + + // try to get an ISSN (no print/electronic preferences) + item.ISSN = query.qr::issn.toString(); + // get title + item.title = query.qr::article_title.toString(); + // get publicationTitle + item.publicationTitle = query.qr::journal_title.toString(); + // get author + item.creators.push(Scholar.Utilities.cleanAuthor(query.qr::author.toString(), "author", true)); + // get volume + item.volume = query.qr::volume.toString(); + // get issue + item.issue = query.qr::issue.toString(); + // get year + item.date = query.qr::year.toString(); + // get edition + item.edition = query.qr::edition_number.toString(); + // get first page + item.pages = query.qr::first_page.toString(); + + return [item]; + } + + /* + * Parses a document object referring to an Open WorldCat entry for its + * OpenURL contextObject, then returns an item generated from this + * contextObject + */ + function _processOWC(doc) { + var spanTags = doc.getElementsByTagName("span"); + for(var i=0; i<spanTags.length; i++) { + var spanClass = spanTags[i].getAttribute("class"); + if(spanClass) { + var spanClasses = spanClass.split(" "); + if(Scholar.inArray("Z3988", spanClasses)) { + var spanTitle = spanTags[i].getAttribute("title"); + var item = parseContextObject(spanTitle); + if(item) { + return [item]; + } else { + return false; + } + } + } + } + + return false; + } + + /* + * Determines the type of an OpenURL contextObject + */ + function _determineResourceType(coParts) { + // determine resource type + var type = false; + for(var i in coParts) { + if(coParts[i].substr(0, 12) == "rft_val_fmt=") { + var format = unescape(coParts[i].substr(12)); + if(format == "info:ofi/fmt:kev:mtx:journal") { + var type = "journal"; + } else if(format == "info:ofi/fmt:kev:mtx:book") { + if(Scholar.inArray("rft.genre=bookitem", coParts)) { + var type = "bookSection"; + } else { + var type = "book"; + } + break; + } + } + } + return type; + } + + /* + * Used to map tags 
for generating OpenURL contextObjects + */ function _mapTag(data, tag, version) { if(data) { if(version == "0.1") { diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -317,6 +317,14 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, return availableItems; } +Scholar.Utilities.Ingester.prototype.lookupContextObject = function(co, done, error) { + return Scholar.OpenURL.lookupContextObject(co, done, error); +} + +Scholar.Utilities.Ingester.prototype.parseContextObject = function(co) { + return Scholar.OpenURL.parseContextObject(co); +} + /* * END SCHOLAR FOR FIREFOX EXTENSIONS */ diff --git a/scrapers.sql b/scrapers.sql @@ -1,7 +1,7 @@ --- 37 +-- 38 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-06 21:45:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-07 01:09:00')); REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', 'function detect(doc, url) { @@ -2332,7 +2332,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006 translator.doImport(); }'); -REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-06 19:14:00', 4, 'COinS Scraper', 'Simon Kornblith', NULL, +REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS Scraper', 'Simon Kornblith', NULL, 'function detect(doc, url) { var spanTags = doc.getElementsByTagName("span"); @@ -2345,18 +2345,31 @@ REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006 if(Scholar.Utilities.inArray("Z3988", spanClasses)) { var spanTitle = spanTags[i].getAttribute("title"); - 
if(spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:journal") != -1) { - var type = "journal"; - } else if(spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:book") != -1) { - var type = "book"; - } else { - continue; + // determine if it''s a valid type + var coParts = spanTitle.split("&"); + var type = null + for(var i in coParts) { + if(coParts[i].substr(0, 12) == "rft_val_fmt=") { + var format = unescape(coParts[i].substr(12)); + if(format == "info:ofi/fmt:kev:mtx:journal") { + var type = "journal"; + } else if(format == "info:ofi/fmt:kev:mtx:book") { + if(Scholar.Utilities.inArray("rft.genre=bookitem", coParts)) { + var type = "bookSection"; + } else { + var type = "book"; + } + break; + } + } } - if(encounteredType) { - return "multiple"; - } else { - encounteredType = type; + if(type) { + if(encounteredType) { + return "multiple"; + } else { + encounteredType = type; + } } } } @@ -2364,119 +2377,58 @@ REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006 return encounteredType; }', -'function parseContextObject(co) { - if(co.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:journal") != -1) { - var type = "journal"; +'// used to retrieve next COinS object when asynchronously parsing COinS objects +// on a page +function retrieveNextCOinS(needFullItems, newItems) { + if(needFullItems.length) { + var item = needFullItems.shift(); + + Scholar.Utilities.debugPrint("looking up contextObject"); + Scholar.Utilities.lookupContextObject(item.contextObject, function(items) { + Scholar.Utilities.debugPrint(items); + if(items) { + newItems = newItems.concat(items); + } + retrieveNextCOinS(needFullItems, newItems); + }, function() { + Scholar.done(false); + }); } else { - if(co.indexOf("rft.genre=bookitem") != -1) { - var type = "bookSection"; - } else { - var type = "book" - } + completeCOinS(newItems); + Scholar.done(true); } - var item = new Scholar.Item(type); - - var pagesKey = ""; - - var coParts = co.split("&"); - for each(part in coParts) { 
- var keyVal = part.split("="); - var key = keyVal[0]; - var value = unescape(keyVal[1].replace(/\+/g, " ")); - if(!value) { - continue; - } +} + +// attaches item data to a new Scholar.Item instance (because data returned from +// Scholar.OpenURL.processContextObject does not have a complete() method) +function addAsItem(itemArray) { + var newItem = new Scholar.Item(); + for(var i in itemArray) { + newItem[i] = itemArray[i]; + } + newItem.complete(); +} + +// saves all COinS objects +function completeCOinS(newItems) { + if(newItems.length > 1) { + var selectArray = new Array(); - if(key == "rft_id") { - var firstEight = value.substr(0, 8).toLowerCase(); - if(firstEight == "info:doi") { - item.DOI = value; - } else if(firstEight == "urn:isbn") { - item.ISBN = value.substr(9); - } - } else if(key == "rft.btitle") { - if(item.itemType == "book") { - item.title = value; - } else if(item.itemType == "bookSection") { - item.publicationTitle = value; - } - } else if(key == "rft.atitle" && item.itemType != "book") { - item.title = value; - } else if(key == "rft.jtitle" && item.itemType == "journal") { - item.publcation = value; - } else if(key == "rft.stitle" && item.itemType == "journal") { - item.journalAbbreviation = value; - } else if(key == "rft.date") { - item.date = value; - } else if(key == "rft.volume") { - item.volume = value; - } else if(key == "rft.issue") { - item.issue = value; - } else if(key == "rft.pages") { - pagesKey = key; - item.pages = value; - } else if(key == "rft.spage") { - if(pagesKey != "rft.pages") { - pagesKey = key; - // make pages look like start-end - if(pagesKey == "rft.epage") { - if(value != item.pages) { - item.pages = value+"-"+item.pages; - } - } else { - item.pages = value; - } - } - } else if(key == "rft.epage") { - if(pagesKey != "rft.pages") { - pagesKey = key; - // make pages look like start-end - if(pagesKey == "rft.spage") { - if(value != item.pages) { - item.pages = +item.pages+"-"+value; - } - } else { - item.pages = value; 
- } - } - } else if(key == "issn" || (key == "eissn" && !item.ISSN)) { - item.ISSN = value; - } else if(key == "rft.aulast") { - var lastCreator = item.creators[item.creators.length-1]; - if(item.creators.length && !lastCreator.lastName && !lastCreator.institutional) { - lastCreator.lastName = value; - } else { - item.creators.push({lastName:value}); - } - } else if(key == "rft.aufirst") { - var lastCreator = item.creators[item.creators.length-1]; - if(item.creators.length && !lastCreator.firstName && !lastCreator.institutional) { - lastCreator.firstName = value; - } else { - item.creators.push({firstName:value}); - } - } else if(key == "rft.au") { - item.creators.push(Scholar.cleanAuthor(value, "author", true)); - } else if(key == "rft.aucorp") { - item.creators.push({lastName:value, institutional:true}); - } else if(key == "rft.isbn" && !item.ISBN) { - item.ISBN = value; - } else if(key == "rft.pub") { - item.publisher = value; - } else if(key == "rft.place") { - item.place = value; - } else if(key == "rft.edition") { - item.edition = value; - } else if(key == "rft.series") { - item.seriesTitle = value; + for(var i in newItems) { + selectArray[i] = newItems.title; + } + selectArray = Scholar.selectItems(selectArray); + for(var i in selectArray) { + addAsItem(newItems[i]); } + } else if(newItems.length) { + addAsItem(newItems[0]); } - - return item; } function doWeb(doc, url) { var newItems = new Array(); + var needFullItems = new Array(); var spanTags = doc.getElementsByTagName("span"); @@ -2486,28 +2438,30 @@ function doWeb(doc, url) { var spanClasses = spanClass.split(" "); if(Scholar.Utilities.inArray("Z3988", spanClasses)) { var spanTitle = spanTags[i].getAttribute("title"); - if(spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:journal") != -1 - || spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:book") != -1) { - newItems.push(parseContextObject(spanTitle)); + var newItem = Scholar.Utilities.parseContextObject(spanTitle); + if(newItem) { + 
if(newItem.title && newItem.creators.length) { + // title and creators are minimum data to avoid looking up + newItems.push(newItem); + } else { + // retrieve full item + newItem.contextObject = spanTitle; + needFullItems.push(newItem); + } } } } } - if(newItems.length > 1) { - var selectArray = new Array(); - - for(var i in newItems) { - selectArray[i] = newItems.title; - } - selectArray = Scholar.selectItems(selectArray); - for(var i in selectArray) { - newItems[i].complete(); - } + if(needFullItems.length) { + // retrieve full items asynchronously + Scholar.wait(); + retrieveNextCOinS(needFullItems, newItems); } else { - newItems[0].complete(); + completeCOinS(newItems); } }'); + REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', 'function detect(doc, url) { var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');