commit 68c2a0039ab8fda226160702d22aff32906061f5
parent 9d78bd70241e3b62a7dee52ad28654b64913b544
Author: Avram Lyon <ajlyon@gmail.com>
Date: Mon, 21 Mar 2011 13:03:37 +0000
Trans: Changes to NYT: Use standard date when available, grab single page snapshot
Diffstat:
1 file changed, 25 insertions(+), 5 deletions(-)
diff --git a/translators/NYTimes.com.js b/translators/NYTimes.com.js
@@ -8,7 +8,7 @@
"maxVersion":"",
"priority":100,
"inRepository":true,
- "lastUpdated":"2011-01-11 04:31:00"
+ "lastUpdated":"2011-03-21 04:31:00"
}
function detectWeb(doc, url) {
@@ -38,6 +38,11 @@ function associateMeta(newItem, metaTags, field, zoteroField) {
}
function scrape(doc, url) {
+ var namespace = null;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == 'x') return namespace; else return null;
+ } : null;
+
var newItem = new Zotero.Item("newspaperArticle");
newItem.publicationTitle = "The New York Times";
newItem.ISSN = "0362-4331";
@@ -65,8 +70,8 @@ function scrape(doc, url) {
if(!metaTags["hdl"]) {
return;
}
-
- newItem.attachments.push({url:url, title:"New York Times Snapshot",
+ // We want to get everything on one page
+ newItem.attachments.push({url:url.replace(/\.html\??([^/]*)(pagewanted=[^&]*)?([^/]*)$/,".html?pagewanted=all&$1$2"), title:"New York Times Snapshot",
mimeType:"text/html"});
} else {
newItem.url = doc.location.href;
@@ -78,8 +83,16 @@ function scrape(doc, url) {
metaTags[key] = value;
}
}
-
- newItem.attachments.push({document:doc, title:"New York Times Snapshot"});
+ // Get everything on one page if possible
+ var singlePage = false;
+ if (!newItem.url.match(/\?pagewanted=all/)
+ && (singlePage = doc.evaluate('//ul[@id="toolsList"]/li[@class="singlePage"]/a', doc, nsResolver,
+ XPathResult.ANY_TYPE, null).iterateNext())) {
+ newItem.attachments.push({url:singlePage.href, title:"New York Times Snapshot",
+ mimeType:"text/html"});
+ } else {
+ newItem.attachments.push({document:doc, title:"New York Times Snapshot"});
+ }
}
associateMeta(newItem, metaTags, "dat", "date");
@@ -87,6 +100,10 @@ function scrape(doc, url) {
associateMeta(newItem, metaTags, "dsk", "section");
associateMeta(newItem, metaTags, "articleid", "accessionNumber");
+ if (metaTags["pdate"]) {
+ newItem.date = metaTags["pdate"].replace(/(\d{4})(\d{2})(\d{2})/,"$1-$2-$3");
+ }
+
if(metaTags["byl"]) {
var author = Zotero.Utilities.trimInternal(metaTags["byl"]);
if(author.substr(0, 3).toLowerCase() == "by ") {
@@ -118,6 +135,9 @@ function scrape(doc, url) {
}
}
+ // Remove pagewanted (and any query text preceding it) from the item URL; parameters after it are kept, in case they might matter
+ newItem.url = newItem.url.replace(/\?([^/]*)pagewanted=[^&]*/,'');
+
newItem.complete();
}