commit 6626eba844e847a2490b7508973d1a3abbf68e4e
parent e3d062a81968ed828d71bf451c1c311ce4742ff2
Author: Simon Kornblith <simon@simonster.com>
Date: Mon, 7 Aug 2006 05:15:30 +0000
addresses #83, figure out how to implement OpenURL
OpenURL lookup now works for books. this means that all that's necessary to add scrapable book metadata to a page is an ISBN, as shown below:
<span class="Z3988" title="ctx_ver=Z39.88-2004&rft_val_fmt=info:ofi/fmt:kev:mtx:book&rft.isbn=1579550088"></span>
also, we can now scrape Open WorldCat and Wikipedia Book Sources pages with no specialized code involved.
i'm still looking for a better way of looking up journal article metadata. it's currently implemented with CrossRef, but CrossRef simply will not work without a DOI, and is also incomplete (only holds the last name of the first author).
Diffstat:
3 files changed, 388 insertions(+), 132 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -147,6 +147,8 @@ Scholar.OpenURL = new function() {
this.resolve = resolve;
this.discoverResolvers = discoverResolvers;
this.createContextObject = createContextObject;
+ this.parseContextObject = parseContextObject;
+ this.lookupContextObject = lookupContextObject;
/*
* Returns a URL to look up an item in the OpenURL resolver
@@ -224,7 +226,7 @@ Scholar.OpenURL = new function() {
co += "&id="+escape(identifier);
}
} else {
- var co = "ctx_ver=Z39.88-2004";
+ var co = "url_ver=Z39.88-2004&ctx_ver=Z39.88-2004";
for each(identifier in identifiers) {
co += "&rft_id="+escape(identifier);
@@ -300,6 +302,298 @@ Scholar.OpenURL = new function() {
return co;
}
+ /*
+ * Generates an item in the format returned by item.fromArray() given an
+ * OpenURL version 1.0 contextObject
+ */
+ function parseContextObject(co) {
+ var coParts = co.split("&");
+
+ var item = new Array();
+ item.creators = new Array();
+
+ // get type
+ item.itemType = _determineResourceType(coParts);
+ if(!item.itemType) {
+ return false;
+ }
+
+ var pagesKey = "";
+
+ for each(part in coParts) {
+ var keyVal = part.split("=");
+ var key = keyVal[0];
+ var value = unescape(keyVal[1].replace(/\+|%2[bB]/g, " "));
+ if(!value) {
+ continue;
+ }
+
+ if(key == "rft_id") {
+ var firstEight = value.substr(0, 8).toLowerCase();
+ if(firstEight == "info:doi") {
+ item.DOI = value;
+ } else if(firstEight == "urn:isbn") {
+ item.ISBN = value.substr(9);
+ }
+ } else if(key == "rft.btitle") {
+ if(item.itemType == "book") {
+ item.title = value;
+ } else if(item.itemType == "bookSection") {
+ item.publicationTitle = value;
+ }
+ } else if(key == "rft.atitle" && item.itemType != "book") {
+ item.title = value;
+ } else if(key == "rft.jtitle" && item.itemType == "journal") {
+ item.publcation = value;
+ } else if(key == "rft.stitle" && item.itemType == "journal") {
+ item.journalAbbreviation = value;
+ } else if(key == "rft.date") {
+ item.date = value;
+ } else if(key == "rft.volume") {
+ item.volume = value;
+ } else if(key == "rft.issue") {
+ item.issue = value;
+ } else if(key == "rft.pages") {
+ pagesKey = key;
+ item.pages = value;
+ } else if(key == "rft.spage") {
+ if(pagesKey != "rft.pages") {
+ pagesKey = key;
+ // make pages look like start-end
+ if(pagesKey == "rft.epage") {
+ if(value != item.pages) {
+ item.pages = value+"-"+item.pages;
+ }
+ } else {
+ item.pages = value;
+ }
+ }
+ } else if(key == "rft.epage") {
+ if(pagesKey != "rft.pages") {
+ pagesKey = key;
+ // make pages look like start-end
+ if(pagesKey == "rft.spage") {
+ if(value != item.pages) {
+ item.pages = +item.pages+"-"+value;
+ }
+ } else {
+ item.pages = value;
+ }
+ }
+ } else if(key == "issn" || (key == "eissn" && !item.ISSN)) {
+ item.ISSN = value;
+ } else if(key == "rft.aulast") {
+ var lastCreator = item.creators[item.creators.length-1];
+ if(item.creators.length && !lastCreator.lastName && !lastCreator.institutional) {
+ lastCreator.lastName = value;
+ } else {
+ item.creators.push({lastName:value});
+ }
+ } else if(key == "rft.aufirst") {
+ var lastCreator = item.creators[item.creators.length-1];
+ if(item.creators.length && !lastCreator.firstName && !lastCreator.institutional) {
+ lastCreator.firstName = value;
+ } else {
+ item.creators.push({firstName:value});
+ }
+ } else if(key == "rft.au") {
+ item.creators.push(Scholar.cleanAuthor(value, "author", true));
+ } else if(key == "rft.aucorp") {
+ item.creators.push({lastName:value, institutional:true});
+ } else if(key == "rft.isbn" && !item.ISBN) {
+ item.ISBN = value;
+ } else if(key == "rft.pub") {
+ item.publisher = value;
+ } else if(key == "rft.place") {
+ item.place = value;
+ } else if(key == "rft.edition") {
+ item.edition = value;
+ } else if(key == "rft.series") {
+ item.seriesTitle = value;
+ }
+ }
+
+ return item;
+ }
+
+ /*
+ * Looks up additional information on an item in the format returned by
+ * item.fromArray() in CrossRef or Open WorldCat given an OpenURL version
+ * 1.0 contextObject
+ */
+ function lookupContextObject(co, done, error) {
+ // CrossRef requires a url_ver to work right
+ if(co.indexOf("url_ver=Z39.88-2004") == -1) {
+ co = "url_ver=Z39.88-2004&"+co;
+ }
+
+ var type = _determineResourceType(co.split("&"));
+ if(!type) {
+ return false;
+ }
+
+ if(type == "journal") {
+ // look up journals in CrossRef
+ Scholar.Utilities.HTTP.doGet("http://www.crossref.org/openurl/?"+co+"&noredirect=true", null, function(req) {
+ var items = _processCrossRef(req.responseText);
+ done(items);
+ });
+ } else {
+ // look up books in Open WorldCat
+ Scholar.Utilities.HTTP.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) {
+ var doc = browser.contentDocument;
+ // find new COinS in the Open WorldCat page
+ items = _processOWC(doc);
+
+ if(items) { // we got a single item page; return the item
+ done(items);
+ } else { // assume we have a search results page
+ var items = new Array();
+
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == 'x') return namespace; else return null;
+ } : null;
+
+ // first try to get only books
+ var elmts = doc.evaluate('//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
+ var elmt = elmts.iterateNext();
+ if(!elmt) { // if that fails, look for other options
+ var elmts = doc.evaluate('//table[@class="tableLayout"]/tbody/tr/td[@class="content"]/table[@class="tableResults"]/tbody/tr[td/img[@alt="Book"]]/td/div[@class="title"]/a', doc, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null);
+ elmt = elmts.iterateNext()
+ }
+
+ var urlsToProcess = new Array();
+ do {
+ urlsToProcess.push(elmt.href);
+ } while(elmt = elmts.iterateNext());
+
+ Scholar.Utilities.HTTP.processDocuments(null, urlsToProcess, function(browser) {
+ // per URL
+ var newItems = _processOWC(browser.contentDocument);
+ if(newItems) {
+ items = items.concat(newItems);
+ }
+ }, function() { // done
+ done(items);
+ }, function() { // error
+ error();
+ });
+ }
+ }, null, function() {
+ error();
+ });
+ }
+ }
+
+ /*
+ * Processes the XML format returned by CrossRef
+ *
+ * xmlOutput - response text of a CrossRef query, as a string
+ *
+ * Returns an array holding a single item, or false if the text cannot be
+ * parsed as XML or the query status is neither "resolved" nor "multiresolved"
+ */
+ function _processCrossRef(xmlOutput) {
+ // remove the XML declaration before the text is handed to the XML constructor
+ // NOTE(review): presumably because E4X cannot parse a leading declaration - confirm
+ xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, "");
+
+ // parse XML with E4X
+ var qr = new Namespace("http://www.crossref.org/qrschema/2.0");
+ try {
+ var xml = new XML(xmlOutput);
+ } catch(e) {
+ return false;
+ }
+
+ // ensure status is valid
+ var status = xml.qr::body.qr::query.@status.toString();
+ if(status != "resolved" && status != "multiresolved") {
+ return false;
+ }
+
+ var query = xml.qr::body.qr::query;
+ var item = new Array();
+ item.creators = new Array();
+
+ // try to get a DOI; the doi types are tried in order of preference:
+ // journal_article, then book_title, then book_content
+ item.DOI = query.qr::doi.(@type=="journal_article").toString();
+ if(!item.DOI) {
+ item.DOI = query.qr::doi.(@type=="book_title").toString();
+ }
+ if(!item.DOI) {
+ item.DOI = query.qr::doi.(@type=="book_content").toString();
+ }
+
+ // try to get an ISSN (no print/electronic preferences)
+ item.ISSN = query.qr::issn.toString();
+ // get title
+ item.title = query.qr::article_title.toString();
+ // get publicationTitle
+ item.publicationTitle = query.qr::journal_title.toString();
+ // get author (exactly one creator is added, built from the author element)
+ item.creators.push(Scholar.Utilities.cleanAuthor(query.qr::author.toString(), "author", true));
+ // get volume
+ item.volume = query.qr::volume.toString();
+ // get issue
+ item.issue = query.qr::issue.toString();
+ // get year
+ item.date = query.qr::year.toString();
+ // get edition
+ item.edition = query.qr::edition_number.toString();
+ // get first page (only the first page is stored in pages)
+ item.pages = query.qr::first_page.toString();
+
+ // wrapped in an array, matching the [item] shape returned by _processOWC()
+ return [item];
+ }
+
+ /*
+ * Parses a document object referring to an Open WorldCat entry for its
+ * OpenURL contextObject, then returns an item generated from this
+ * contextObject
+ */
+ function _processOWC(doc) {
+ var spanTags = doc.getElementsByTagName("span");
+ for(var i=0; i<spanTags.length; i++) {
+ var spanClass = spanTags[i].getAttribute("class");
+ if(spanClass) {
+ var spanClasses = spanClass.split(" ");
+ if(Scholar.inArray("Z3988", spanClasses)) {
+ var spanTitle = spanTags[i].getAttribute("title");
+ var item = parseContextObject(spanTitle);
+ if(item) {
+ return [item];
+ } else {
+ return false;
+ }
+ }
+ }
+ }
+
+ return false;
+ }
+
+ /*
+ * Determines the type of an OpenURL contextObject
+ */
+ function _determineResourceType(coParts) {
+ // determine resource type
+ var type = false;
+ for(var i in coParts) {
+ if(coParts[i].substr(0, 12) == "rft_val_fmt=") {
+ var format = unescape(coParts[i].substr(12));
+ if(format == "info:ofi/fmt:kev:mtx:journal") {
+ var type = "journal";
+ } else if(format == "info:ofi/fmt:kev:mtx:book") {
+ if(Scholar.inArray("rft.genre=bookitem", coParts)) {
+ var type = "bookSection";
+ } else {
+ var type = "book";
+ }
+ break;
+ }
+ }
+ }
+ return type;
+ }
+
+ /*
+ * Used to map tags for generating OpenURL contextObjects
+ */
function _mapTag(data, tag, version) {
if(data) {
if(version == "0.1") {
diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js
@@ -317,6 +317,14 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe,
return availableItems;
}
+/*
+ * Makes Scholar.OpenURL.lookupContextObject() available on ingester utilities
+ * instances; all three arguments and the return value are passed through
+ * unchanged
+ */
+Scholar.Utilities.Ingester.prototype.lookupContextObject = function(co, done, error) {
+ return Scholar.OpenURL.lookupContextObject(co, done, error);
+}
+
+/*
+ * Makes Scholar.OpenURL.parseContextObject() available on ingester utilities
+ * instances; the argument and the return value are passed through unchanged
+ */
+Scholar.Utilities.Ingester.prototype.parseContextObject = function(co) {
+ return Scholar.OpenURL.parseContextObject(co);
+}
+
/*
* END SCHOLAR FOR FIREFOX EXTENSIONS
*/
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,7 +1,7 @@
--- 37
+-- 38
-- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-06 21:45:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-07 01:09:00'));
REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)',
'function detect(doc, url) {
@@ -2332,7 +2332,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006
translator.doImport();
}');
-REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-06 19:14:00', 4, 'COinS Scraper', 'Simon Kornblith', NULL,
+REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS Scraper', 'Simon Kornblith', NULL,
'function detect(doc, url) {
var spanTags = doc.getElementsByTagName("span");
@@ -2345,18 +2345,31 @@ REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006
if(Scholar.Utilities.inArray("Z3988", spanClasses)) {
var spanTitle = spanTags[i].getAttribute("title");
- if(spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:journal") != -1) {
- var type = "journal";
- } else if(spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:book") != -1) {
- var type = "book";
- } else {
- continue;
+ // determine if it''s a valid type
+ var coParts = spanTitle.split("&");
+ var type = null
+ for(var i in coParts) {
+ if(coParts[i].substr(0, 12) == "rft_val_fmt=") {
+ var format = unescape(coParts[i].substr(12));
+ if(format == "info:ofi/fmt:kev:mtx:journal") {
+ var type = "journal";
+ } else if(format == "info:ofi/fmt:kev:mtx:book") {
+ if(Scholar.Utilities.inArray("rft.genre=bookitem", coParts)) {
+ var type = "bookSection";
+ } else {
+ var type = "book";
+ }
+ break;
+ }
+ }
}
- if(encounteredType) {
- return "multiple";
- } else {
- encounteredType = type;
+ if(type) {
+ if(encounteredType) {
+ return "multiple";
+ } else {
+ encounteredType = type;
+ }
}
}
}
@@ -2364,119 +2377,58 @@ REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006
return encounteredType;
}',
-'function parseContextObject(co) {
- if(co.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:journal") != -1) {
- var type = "journal";
+'// used to retrieve next COinS object when asynchronously parsing COinS objects
+// on a page
+//
+// needFullItems - items still waiting for a lookup, each carrying its
+//                 contextObject string; one entry is removed per call
+// newItems - items collected so far; handed to completeCOinS() once
+//            needFullItems is empty
+function retrieveNextCOinS(needFullItems, newItems) {
+ if(needFullItems.length) {
+ var item = needFullItems.shift();
+
+ Scholar.Utilities.debugPrint("looking up contextObject");
+ // first callback receives the lookup result, second one reports an error
+ Scholar.Utilities.lookupContextObject(item.contextObject, function(items) {
+ Scholar.Utilities.debugPrint(items);
+ // NOTE(review): when the lookup yields nothing, the partially parsed item is
+ // not added to newItems either, so it is dropped - confirm this is intended
+ if(items) {
+ newItems = newItems.concat(items);
+ }
+ // continue with the remaining items, passing along the grown list
+ retrieveNextCOinS(needFullItems, newItems);
+ }, function() {
+ Scholar.done(false);
+ });
} else {
- if(co.indexOf("rft.genre=bookitem") != -1) {
- var type = "bookSection";
- } else {
- var type = "book"
- }
+ // nothing left to look up: save what was collected and report completion
+ completeCOinS(newItems);
+ Scholar.done(true);
}
- var item = new Scholar.Item(type);
-
- var pagesKey = "";
-
- var coParts = co.split("&");
- for each(part in coParts) {
- var keyVal = part.split("=");
- var key = keyVal[0];
- var value = unescape(keyVal[1].replace(/\+/g, " "));
- if(!value) {
- continue;
- }
+}
+
+// attaches item data to a new Scholar.Item instance (because data returned from
+// Scholar.OpenURL.processContextObject does not have a complete() method)
+function addAsItem(itemArray) {
+ var newItem = new Scholar.Item();
+ for(var i in itemArray) {
+ newItem[i] = itemArray[i];
+ }
+ newItem.complete();
+}
+
+// saves all COinS objects
+function completeCOinS(newItems) {
+ if(newItems.length > 1) {
+ var selectArray = new Array();
- if(key == "rft_id") {
- var firstEight = value.substr(0, 8).toLowerCase();
- if(firstEight == "info:doi") {
- item.DOI = value;
- } else if(firstEight == "urn:isbn") {
- item.ISBN = value.substr(9);
- }
- } else if(key == "rft.btitle") {
- if(item.itemType == "book") {
- item.title = value;
- } else if(item.itemType == "bookSection") {
- item.publicationTitle = value;
- }
- } else if(key == "rft.atitle" && item.itemType != "book") {
- item.title = value;
- } else if(key == "rft.jtitle" && item.itemType == "journal") {
- item.publcation = value;
- } else if(key == "rft.stitle" && item.itemType == "journal") {
- item.journalAbbreviation = value;
- } else if(key == "rft.date") {
- item.date = value;
- } else if(key == "rft.volume") {
- item.volume = value;
- } else if(key == "rft.issue") {
- item.issue = value;
- } else if(key == "rft.pages") {
- pagesKey = key;
- item.pages = value;
- } else if(key == "rft.spage") {
- if(pagesKey != "rft.pages") {
- pagesKey = key;
- // make pages look like start-end
- if(pagesKey == "rft.epage") {
- if(value != item.pages) {
- item.pages = value+"-"+item.pages;
- }
- } else {
- item.pages = value;
- }
- }
- } else if(key == "rft.epage") {
- if(pagesKey != "rft.pages") {
- pagesKey = key;
- // make pages look like start-end
- if(pagesKey == "rft.spage") {
- if(value != item.pages) {
- item.pages = +item.pages+"-"+value;
- }
- } else {
- item.pages = value;
- }
- }
- } else if(key == "issn" || (key == "eissn" && !item.ISSN)) {
- item.ISSN = value;
- } else if(key == "rft.aulast") {
- var lastCreator = item.creators[item.creators.length-1];
- if(item.creators.length && !lastCreator.lastName && !lastCreator.institutional) {
- lastCreator.lastName = value;
- } else {
- item.creators.push({lastName:value});
- }
- } else if(key == "rft.aufirst") {
- var lastCreator = item.creators[item.creators.length-1];
- if(item.creators.length && !lastCreator.firstName && !lastCreator.institutional) {
- lastCreator.firstName = value;
- } else {
- item.creators.push({firstName:value});
- }
- } else if(key == "rft.au") {
- item.creators.push(Scholar.cleanAuthor(value, "author", true));
- } else if(key == "rft.aucorp") {
- item.creators.push({lastName:value, institutional:true});
- } else if(key == "rft.isbn" && !item.ISBN) {
- item.ISBN = value;
- } else if(key == "rft.pub") {
- item.publisher = value;
- } else if(key == "rft.place") {
- item.place = value;
- } else if(key == "rft.edition") {
- item.edition = value;
- } else if(key == "rft.series") {
- item.seriesTitle = value;
+ for(var i in newItems) {
+ selectArray[i] = newItems.title;
+ }
+ selectArray = Scholar.selectItems(selectArray);
+ for(var i in selectArray) {
+ addAsItem(newItems[i]);
}
+ } else if(newItems.length) {
+ addAsItem(newItems[0]);
}
-
- return item;
}
function doWeb(doc, url) {
var newItems = new Array();
+ var needFullItems = new Array();
var spanTags = doc.getElementsByTagName("span");
@@ -2486,28 +2438,30 @@ function doWeb(doc, url) {
var spanClasses = spanClass.split(" ");
if(Scholar.Utilities.inArray("Z3988", spanClasses)) {
var spanTitle = spanTags[i].getAttribute("title");
- if(spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:journal") != -1
- || spanTitle.indexOf("rft_val_fmt=info:ofi/fmt:kev:mtx:book") != -1) {
- newItems.push(parseContextObject(spanTitle));
+ var newItem = Scholar.Utilities.parseContextObject(spanTitle);
+ if(newItem) {
+ if(newItem.title && newItem.creators.length) {
+ // title and creators are minimum data to avoid looking up
+ newItems.push(newItem);
+ } else {
+ // retrieve full item
+ newItem.contextObject = spanTitle;
+ needFullItems.push(newItem);
+ }
}
}
}
}
- if(newItems.length > 1) {
- var selectArray = new Array();
-
- for(var i in newItems) {
- selectArray[i] = newItems.title;
- }
- selectArray = Scholar.selectItems(selectArray);
- for(var i in selectArray) {
- newItems[i].complete();
- }
+ if(needFullItems.length) {
+ // retrieve full items asynchronously
+ Scholar.wait();
+ retrieveNextCOinS(needFullItems, newItems);
} else {
- newItems[0].complete();
+ completeCOinS(newItems);
}
}');
+
REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)',
'function detect(doc, url) {
var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i'');