commit a457cdb493334fb75a804ed962ea19f8b9ad4e8c
parent ddb754839ca615230fd4b756b82b3719e1ea7989
Author: Simon Kornblith <simon@simonster.com>
Date: Sat, 26 Aug 2006 07:27:02 +0000
added New York Times translator
Diffstat:
| M | scrapers.sql | | | 166 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--- |
1 file changed, 160 insertions(+), 6 deletions(-)
diff --git a/scrapers.sql b/scrapers.sql
@@ -703,17 +703,16 @@ function scrape(doc) {
}
}
- newItem.attachments.push({document:doc, title:"History Cooperative Full Text",
- downloadable:true});
-
- newItem.complete();
-
- // don''t actually need date info for a journal article
var month = metaTags.namedItem("PublicationMonth");
var year = metaTags.namedItem("PublicationYear");
if(month && year) {
newItem.date = month.getAttribute("content")+" "+year.getAttribute("content");
}
+
+ newItem.attachments.push({document:doc, title:"History Cooperative Full Text",
+ downloadable:true});
+
+ newItem.complete();
}
function doWeb(doc, url) {
@@ -3344,6 +3343,161 @@ function doWeb(doc, url) {
Scholar.wait();
}');
+REPLACE INTO "translators" VALUES ('ce7a3727-d184-407f-ac12-52837f3361ff', '2006-08-26 14:21:00', 4, 'New York Times', 'Simon Kornblith', '^(?:http://query.nytimes.com/search/query|http://www.nytimes.com/.+)',
+'function getList(urls, each, done) {
+ var url = urls.shift();
+ Scholar.Utilities.HTTP.doGet(url, function(text) {
+ if(each) {
+ each(text, url);
+ }
+
+ if(urls.length) {
+ getList(urls, each, done);
+ } else if(done) {
+ done(text);
+ }
+ });
+}
+
+function detectWeb(doc, url) {
+ if(doc.title.substr(0, 30) == "The New York Times: Search for") {
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ var result = doc.evaluate(''//div[@id="srchContent"]'', doc, nsResolver,
+ XPathResult.ANY_TYPE, null).iterateNext();
+ if(result) {
+ return "multiple";
+ }
+ } else {
+ var metaTags = doc.getElementsByTagName("meta");
+ if(metaTags.namedItem("hdl") && metaTags.namedItem("byl")) {
+ return "newspaperArticle";
+ }
+ }
+}',
+'function associateMeta(newItem, metaTags, field, scholarField) {
+ if(metaTags[field]) {
+ newItem[scholarField] = metaTags[field];
+ }
+}
+
+function scrape(doc, url) {
+ var newItem = new Scholar.Item("newspaperArticle");
+ newItem.publicationTitle = "The New York Times";
+ newItem.ISSN = "0362-4331";
+
+ var metaTags = new Object();
+ if(url != undefined) {
+ newItem.url = url;
+ var metaTagRe = /<meta[^>]*>/gi;
+ var nameRe = /name="([^"]+)"/i;
+ var contentRe = /content="([^"]+)"/i;
+ var m = doc.match(metaTagRe);
+
+ if(!m) {
+ return;
+ }
+
+ for(var i=0; i<m.length; i++) {
+ var name = nameRe.exec(m[i]);
+ var content = contentRe.exec(m[i]);
+ if(name && content) {
+ metaTags[name[1]] = content[1];
+ }
+ }
+
+ if(!metaTags["hdl"]) {
+ return;
+ }
+
+ newItem.attachments.push({url:url, title:"New York Times Article",
+ mimeType:"text/html", downloadable:true});
+ } else {
+ newItem.url = doc.location.href;
+ var metaTagHTML = doc.getElementsByTagName("meta");
+ for(var i=0; i<metaTagHTML.length; i++) {
+ var key = metaTagHTML[i].getAttribute("name");
+ var value = metaTagHTML[i].getAttribute("content");
+ if(key && value) {
+ metaTags[key] = value;
+ }
+ }
+
+ newItem.attachments.push({document:doc, title:"New York Times Article",
+ downloadable:true});
+ }
+
+ associateMeta(newItem, metaTags, "dat", "date");
+ associateMeta(newItem, metaTags, "hdl", "title");
+ associateMeta(newItem, metaTags, "dsk", "section");
+ associateMeta(newItem, metaTags, "articleid", "accessionNumber");
+
+ if(metaTags["byl"]) {
+ var author = metaTags["byl"];
+ if(author.substr(0, 3).toLowerCase() == "by ") {
+ author = author.substr(3);
+ }
+
+ var authors = author.split(" and ");
+ for each(var author in authors) {
+ // fix capitalization
+ var words = author.split(" ");
+ for(var i in words) {
+ words[i] = words[i][0].toUpperCase()+words[i].substr(1).toLowerCase();
+ }
+ author = words.join(" ");
+
+ if(words[0] == "The") {
+ newItem.creators.push({lastName:author, creatorType:"author"});
+ } else {
+ newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author"));
+ }
+ }
+ }
+
+ if(metaTags["keywords"]) {
+ var keywords = metaTags["keywords"];
+ newItem.tags = keywords.split(",");
+ for(var i in newItem.tags) {
+ newItem.tags[i] = newItem.tags[i].replace(" ", ", ");
+ }
+ }
+
+ newItem.complete();
+}
+
+function doWeb(doc, url) {
+ if(doc.title.substr(0, 30) == "The New York Times: Search for") {
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+ } : null;
+
+ var result = doc.evaluate(''//div[@id="srchContent"]'', doc, nsResolver,
+ XPathResult.ANY_TYPE, null).iterateNext();
+ var items = Scholar.Utilities.getItemArray(doc, result, ''^http://www.nytimes.com/.*\.html$'');
+ items = Scholar.selectItems(items);
+
+ if(!items) {
+ return true;
+ }
+
+ var urls = new Array();
+ for(var i in items) {
+ urls.push(i);
+ }
+
+ getList(urls, scrape, function() { Scholar.done(); }, null);
+
+ Scholar.wait();
+ } else {
+ scrape(doc);
+ }
+}');
+
REPLACE INTO "translators" VALUES ('e07e9b8c-0e98-4915-bb5a-32a08cb2f365', '2006-08-07 11:36:00', 8, 'Open WorldCat', 'Simon Kornblith', 'http://partneraccess.oclc.org/',
'function detectSearch(item) {
if(item.itemType == "book" || item.itemType == "bookSection") {