commit 88d987c6ffc43b3778731a3e4f9492ed9779b57a
parent ba381943e1ccea528b4c242d92b1e3fbac09d757
Author: Matt Burton <mcburton@gmail.com>
Date: Mon, 29 Mar 2010 15:01:08 +0000
Adding new translators from Frank.
Diffstat:
2 files changed, 281 insertions(+), 0 deletions(-)
diff --git a/translators/Japan Times Online.js b/translators/Japan Times Online.js
@@ -0,0 +1,160 @@
+{
+ "translatorID":"b56d756e-934e-4b46-bc58-d61dccc9f32f",
+ "translatorType":4,
+ "label":"Japan Times Online",
+ "creator":"Frank Bennett",
+ "target":"^http://(?:www|search)\\.japantimes\\.co\\.jp/(?:cgi-bin|gsearch|features|entertainment|sports|life|news)",
+ "minVersion":"2.0b7",
+ "maxVersion":"",
+ "priority":100,
+ "inRepository":true,
+ "lastUpdated":"2009-01-23 02:17:09"
+}
+
+// #################################
+// #### Local utility functions ####
+// #################################
+// Article-page URL pattern. A regex literal keeps the dots escaped (in a quoted string '\.' collapses to '.').
+var itemRe = /^http:\/\/search\.japantimes\.co\.jp\/cgi-bin\/[a-z]{2}[0-9]{8}[a-z0-9]{2}\.html/;
+
+var getResolver = function (doc) {
+    //
+    // Build the namespace resolver handed to doc.evaluate().
+    // Only the prefix 'x' is mapped; it resolves to the
+    // namespace URI of the document's root element. When
+    // the root element has no namespace URI, null is
+    // returned instead of a resolver function.
+    //
+    var rootNamespace = doc.documentElement.namespaceURI;
+    if (!rootNamespace) {
+        return null;
+    }
+    return function (prefix) {
+        return (prefix == 'x') ? rootNamespace : null;
+    };
+};
+
+var getTagContent = function (txt, attribute, value) {
+    //
+    // Return the text that directly follows the first tag in txt
+    // carrying attribute="value" (up to the next "<"), or false
+    // when no such tag is found.
+    //
+    var pattern = "<[^>]*" + attribute + "=\"" + value + "\"[^>]*>([^<]*)<";
+    var hit = RegExp(pattern).exec(txt);
+    return hit ? hit[1] : false;
+};
+
+var getTagsWithAttributeAndContent = function (txt, tag, attribute) {
+    //
+    // Map the attribute value of every <tag ...>content</tag> pair found
+    // in txt to its cleaned-up content, keeping only values that match itemRe.
+    //
+    var found = {};
+    var closeTag = "</" + tag + ">";
+    var splitter = RegExp("(<" + tag + "(?: [^>]*>|>)|</" + tag+ ">)");
+    var attrFinder = RegExp(' ' + attribute + '="([^"]+)"');
+    // The capturing group keeps the tags in the split result; the loop expects
+    // an opening tag, its content and its closing tag at i, i + 1 and i + 2.
+    var parts = txt.split(splitter);
+    for (var i = 1; i < parts.length; i += 4) {
+        if (i >= (parts.length - 2) || parts[i + 2] != closeTag) {
+            continue;
+        }
+        var attr = parts[i].match(attrFinder);
+        if (attr && itemRe.exec(attr[1])) {
+            var title = parts[i + 1].replace(/\|.*/, "").replace(/<[^>]+>/g, "");
+            found[attr[1]] = Zotero.Utilities.unescapeHTML(title);
+        }
+    }
+    return found;
+};
+
+// #########################
+// ##### API functions #####
+// #########################
+
+var detectWeb = function (doc, url) {
+    //
+    // A page whose address matches itemRe is a single article;
+    // every other page is reported as "multiple".
+    //
+    return itemRe.test(doc.location.href) ? "newspaperArticle" : "multiple";
+};
+
+var doWeb = function (doc, url) {
+    // Entry point for a matched page: a single article page is saved directly;
+    // on any other page the candidate links are collected, offered through
+    // Zotero.selectItems(), and each selected article is saved.
+    var nsResolver, availableItems, xpath, found, nodes, headline, myurl, items, hasItems;
+    nsResolver = getResolver(doc);
+    if (detectWeb(doc, url) === "newspaperArticle") {
+        scrapeAndParse(url);
+        return;
+    }
+    availableItems = {};
+    if (url.match(/\/gsearch\//)) {
+        // For Google SafeSearch. Thanks, guys, it was an entertaining afternoon.
+        xpath = '//iframe[@name="googleSearchFrame"]';
+        found = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+        if (found) {
+            // A missing iframe leaves the list empty instead of throwing on found.src.
+            availableItems = getTagsWithAttributeAndContent(Zotero.Utilities.retrieveSource(found.src), "a", "href");
+        }
+    } else {
+        xpath = '//a[contains(@href, "cgi-bin")]';
+        nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+        for (found = nodes.iterateNext(); found; found = nodes.iterateNext()) {
+            if (!itemRe.test(found.href)) {
+                continue;
+            }
+            // String.replace() has no standard flags argument, so whitespace is collapsed with a global regex.
+            headline = found.text.replace(/[\s\u00a0]+/g, " ").replace(/^ | $/g, "");
+            //
+            // Some headlines have a weird structure that yields two
+            // entries, the second of which is blank. Nothing is lost
+            // by skipping the blank one.
+            //
+            if (headline) {
+                availableItems[found.href] = headline;
+            }
+        }
+    }
+    // __count__ is a non-standard property, so emptiness is tested by looking for an own key.
+    hasItems = false;
+    for (myurl in availableItems) {
+        hasItems = hasItems || availableItems.hasOwnProperty(myurl);
+    }
+    if (hasItems) {
+        items = Zotero.selectItems(availableItems);
+        for (myurl in items) {
+            if (items.hasOwnProperty(myurl)) {
+                scrapeAndParse(myurl);
+            }
+        }
+    }
+};
+
+// ############################
+// ##### Scraper function #####
+// ############################
+
+var scrapeAndParse = function (url) {
+    //
+    // Save the article at url as a newspaperArticle item. The page
+    // source is fetched as text; the contents of the tags carrying
+    // id="date" and id="headline" become the item's date and title
+    // when they are found and non-empty.
+    //
+    var article = new Zotero.Item("newspaperArticle");
+    var source = Zotero.Utilities.retrieveSource(url);
+    var dateText, headlineText;
+    article.publicationTitle = "Japan Times Online";
+    article.url = url;
+    dateText = getTagContent(source, "id", "date");
+    if (dateText) { article.date = dateText; }
+    headlineText = getTagContent(source, "id", "headline");
+    if (headlineText) { article.title = headlineText; }
+    article.attachments.push({title:"Japan Times Online snapshot", mimeType:"text/html", url:url});
+    article.complete();
+};
diff --git a/translators/Mainichi Daily News.js b/translators/Mainichi Daily News.js
@@ -0,0 +1,121 @@
+{
+ "translatorID":"b56f856e-934e-4b46-bc58-d61dccc9f32f",
+ "translatorType":4,
+ "label":"Mainichi Daily News",
+ "creator":"Frank Bennett",
+ "target":"^http://(?:search\\.)*mdn\\.mainichi\\.jp/(?:$|result\?|mdnnews/|perspectives/|features/|arts/|travel/)",
+ "minVersion":"2.0b7",
+ "maxVersion":"",
+ "priority":100,
+ "inRepository":true,
+ "lastUpdated":"2009-01-23 02:17:09"
+}
+
+// #################################
+// #### Local utility functions ####
+// #################################
+// Article URL pattern; group 1 is the eight-digit run read as the date. A regex literal keeps '\.' a literal dot.
+var itemRe = /.*\/([0-9]{8})[a-z]{1}[0-9]{1}[a-z]{1}[0-9]{2}[a-z]{1}[0-9]{1}[a-z]{2}[0-9]{6}c\.html/;
+
+var getResolver = function (doc) {
+    //
+    // Build the namespace resolver handed to doc.evaluate().
+    // Only the prefix 'x' is mapped; it resolves to the
+    // namespace URI of the document's root element. When
+    // the root element has no namespace URI, null is
+    // returned instead of a resolver function.
+    //
+    var rootNamespace = doc.documentElement.namespaceURI;
+    if (!rootNamespace) {
+        return null;
+    }
+    return function (prefix) {
+        return (prefix == 'x') ? rootNamespace : null;
+    };
+};
+
+var cleanUp = function (str) {
+    // Strip a "| ..." suffix and inline tags, decode HTML entities, then tidy whitespace.
+    // Whitespace is normalised last so that nothing dangles where text was cut away,
+    // and with regular expressions because String.replace() has no standard flags argument.
+    var ret = str.replace(/\|[\s\S]*/, "").replace(/<[^>]+>/g, "");
+    ret = Zotero.Utilities.unescapeHTML(ret);
+    return ret.replace(/[\s\u00a0]+/g, " ").replace(/^ | $/g, "");
+};
+
+
+// #########################
+// ##### API functions #####
+// #########################
+
+var detectWeb = function (doc, url) {
+    //
+    // A page whose address matches itemRe is a single article;
+    // every other page is reported as "multiple".
+    //
+    return itemRe.test(doc.location.href) ? "newspaperArticle" : "multiple";
+};
+
+var doWeb = function (doc, url) {
+    //
+    // Entry point for a matched page. An article page is saved with the
+    // cleaned-up text of its first h2.NewsTitle element as title. On any
+    // other page the article links are collected, offered through
+    // Zotero.selectItems(), and each selected one is saved under its link text.
+    //
+    var nsResolver, availableItems, xpath, found, nodes, myurl, items, hasItems;
+    nsResolver = getResolver(doc);
+    if (detectWeb(doc, url) === "newspaperArticle") {
+        nodes = doc.evaluate('//h2[@class="NewsTitle"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
+        found = nodes.iterateNext();
+        if (found) {
+            scrapeAndParse(url, cleanUp(found.textContent));
+        }
+        return;
+    }
+    if (url.match(/^http:\/\/search\.mdn\.mainichi\.jp\/result\?/)) {
+        xpath = '//div[@class="ResultTitle"]/a[contains(@href, "mdn.mainichi.jp")]';
+    } else {
+        xpath = '//h2[@class="NewsTitle"]/a[@href]|//ul[@class="Mark"]/li/a[@href]';
+    }
+    availableItems = {};
+    hasItems = false;
+    nodes = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+    for (found = nodes.iterateNext(); found; found = nodes.iterateNext()) {
+        if (itemRe.test(found.href)) {
+            availableItems[found.href] = cleanUp(found.textContent);
+            hasItems = true;
+        }
+    }
+    // A flag replaces the non-standard __count__ property, which cannot be relied on.
+    if (hasItems) {
+        items = Zotero.selectItems(availableItems);
+        for (myurl in items) {
+            if (items.hasOwnProperty(myurl)) {
+                scrapeAndParse(myurl, availableItems[myurl]);
+            }
+        }
+    }
+};
+
+// ############################
+// ##### Scraper function #####
+// ############################
+
+var scrapeAndParse = function (url, title) {
+    // Save the article at url as a newspaperArticle item with the given
+    // title. The eight digits captured by itemRe in the URL become the
+    // date, written as YYYY-MM-DD; the date is left unset when the URL
+    // does not match.
+    var article = new Zotero.Item("newspaperArticle");
+    var stamp = itemRe.exec(url);
+    article.title = title;
+    article.publicationTitle = "Mainichi Daily News";
+    article.edition = "online edition";
+    article.url = url;
+    if (stamp) {
+        article.date = stamp[1].substring(0, 4) + "-" + stamp[1].substring(4, 6) + "-" + stamp[1].substring(6);
+    }
+    article.attachments.push({title:"Mainichi Daily News snapshot", mimeType:"text/html", url:url});
+    article.complete();
+};