commit 72da6e412e7f96d698bc40f86fa298b28e886e07
parent 4b48ff041426d3312925b435d05bd6021e73d3bf
Author: Simon Kornblith <simon@simonster.com>
Date: Sat, 26 Aug 2006 05:51:41 +0000
add Google Scholar translator
Diffstat:
| M | scrapers.sql | | | 111 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++- |
1 file changed, 110 insertions(+), 1 deletion(-)
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,4 +1,4 @@
--- 57
+-- 58
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-15 15:42:00'));
@@ -2995,6 +2995,115 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006
Scholar.wait();
}');
+REPLACE INTO "translators" VALUES ('57a00950-f0d1-4b41-b6ba-44ff0fc30289', '2006-08-26 1:10:00', 4, 'Google Scholar', 'Simon Kornblith', '^http://scholar\.google\.com/scholar',
+'function detectWeb(doc, url) {
+ return "multiple";
+}',
+'function getList(urls, each, done) {
+ var url = urls.shift();
+ Scholar.Utilities.HTTP.doGet(url, function(text) {
+ if(each) {
+ each(text);
+ }
+
+ if(urls.length) {
+ getList(urls, each, done);
+ } else if(done) {
+ done(text);
+ }
+ });
+}
+
+function doWeb(doc, url) {
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ var items = new Array();
+ var relatedLinks = new Array();
+ var links = new Array();
+ var types = new Array();
+
+ var itemTypes = new Array();
+ var attachments = new Array();
+
+ var elmts = doc.evaluate(''//p[@class="g"]'', doc, nsResolver,
+ XPathResult.ANY_TYPE, null);
+ var elmt;
+ var i=0;
+ while(elmt = elmts.iterateNext()) {
+ var isCitation = doc.evaluate("./font[1]/b[1]/text()[1]", elmt, nsResolver,
+ XPathResult.ANY_TYPE, null).iterateNext();
+ var relatedLink = doc.evaluate(''.//a[font/text() = "Related Articles"]'',
+ elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if(relatedLink) {
+ relatedLinks[i] = relatedLink.href;
+ if(isCitation && isCitation.nodeValue == "[CITATION]") {
+ items[i] = Scholar.Utilities.getNodeString(doc, elmt, ''./text()|./b/text()'', nsResolver);
+ } else if(isCitation && isCitation.nodeValue == "[BOOK]") {
+ items[i] = Scholar.Utilities.getNodeString(doc, elmt, ''./text()|./b/text()'', nsResolver);
+ types[i] = "book";
+ } else {
+ var link = doc.evaluate(''.//span[@class="w"]/a'', elmt, nsResolver,
+ XPathResult.ANY_TYPE, null).iterateNext();
+ if(link) {
+ items[i] = link.textContent;
+ links[i] = link.href;
+ }
+ }
+
+ if(items[i]) {
+ i++;
+ }
+ }
+ }
+
+ items = Scholar.selectItems(items);
+ if(!items) {
+ return true;
+ }
+
+ var relatedMatch = /[&?]q=related:([^&]+)/;
+
+ var urls = new Array();
+ for(var i in items) {
+ var m = relatedMatch.exec(relatedLinks[i]);
+ urls.push("http://scholar.google.com/scholar.ris?hl=en&lr=&q=info:"+m[1]+"&output=citation&oi=citation");
+ if(links[i]) {
+ attachments.push([{title:"Google Scholar Linked Page", type:"text/html",
+ url:links[i]}]);
+ } else {
+ attachments.push([]);
+ }
+
+ if(types[i]) { // for books
+ itemTypes.push(types[i]);
+ } else {
+ itemTypes.push(null);
+ }
+ }
+
+ var translator = Scholar.loadTranslator("import");
+ translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
+ translator.setHandler("itemDone", function(obj, item) {
+ var itemType = itemTypes.shift();
+ if(itemType) {
+ item.itemType = itemType;
+ }
+
+ item.attachments = attachments.shift();
+ item.complete();
+ });
+
+ getList(urls, function(text) {
+ translator.setString(text);
+ translator.translate();
+ }, function() { Scholar.done() });
+
+ Scholar.wait();
+}');
+
REPLACE INTO "translators" VALUES ('9c335444-a562-4f88-b291-607e8f46a9bb', '2006-08-15 15:42:00', 4, 'Berkeley Library', 'Simon Kornblith', '^http://[^/]*berkeley.edu[^/]*/WebZ/(?:html/results.html|FETCH)\?.*sessionid=',
'function detectWeb(doc, url) {
var resultsRegexp = /\/WebZ\/html\/results.html/i