commit 2f64ff78ea5f3b8cd291c31499e988b6ddbdf594
parent 536f1ee24e55fa9bff249b378bcdbf91455daa5d
Author: Avram Lyon <ajlyon@gmail.com>
Date: Tue, 14 Sep 2010 15:06:21 +0000
New translator for NZ's Papers Past, by staplegun
Diffstat:
1 file changed, 171 insertions(+), 0 deletions(-)
diff --git a/translators/Papers Past.js b/translators/Papers Past.js
@@ -0,0 +1,171 @@
+{
+ "translatorID":"1b052690-16dd-431d-9828-9dc675eb55f6",
+ "label":"Papers Past",
+ "creator":"staplegun",
+ "target":"^http://paperspast\\.natlib\\.govt\\.nz",
+ "minVersion":"1.0",
+ "maxVersion":"",
+ "priority":100,
+ "inRepository":"1",
+ "translatorType":4,
+ "lastUpdated":"2010-09-14 19:04:32"
+}
+
+/*
+ Papers Past Translator - Parses historic digitised newspaper articles and creates Zotero-based metadata
+ Copyright (C) 2010 staplegun
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+*/
+
+function detectWeb(doc, url) {
+
+ // a results parameter in URL means search hitlist
+ if (url.match(/results=/) ) {
+ return "multiple";
+
+ } else {
+
+ // init variables
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == "x" ) return namespace; else return null;
+ } : null;
+ var myXPath;
+ var myXPathObject;
+
+ // publication title in meta tags means have an article
+ myXPath = '//meta[@name="newsarticle_publication"]/@content';
+ myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var meta = myXPathObject.iterateNext().textContent;
+ if (meta.length > 0) {
+ return "newspaperArticle";
+ }
+ }
+}
+
+function doWeb(doc, url) {
+
+ // init variables
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == "x" ) return namespace; else return null;
+ } : null;
+
+ // hitlist page: compile hitlist titles, user selects which are wanted
+ // (add &zto=1 to URL for usage tracking)
+ var articles = new Array();
+ if (detectWeb(doc, url) == "multiple") {
+ var titlesXPath = '//div[@class="search-results"]/p/a';
+ var titles = doc.evaluate(titlesXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var nextTitle;
+ var items = new Array();
+ while (nextTitle = titles.iterateNext()) {
+ items[nextTitle.href+"&zto=1"] = nextTitle.textContent;
+ }
+ // presented to user - who reduces list to those selected
+ items = Zotero.selectItems(items);
+ // transfer this list to articles array
+ for (var i in items) {
+ articles.push(i);
+ }
+
+ // article page: just continue with single (current) page URL
+ } else {
+ articles = [url+"&zto=1"];
+ }
+
+ // process each selected article page URL
+ Zotero.Utilities.processDocuments(articles, scrape, function(){Zotero.done();});
+ Zotero.wait();
+}
+
+function scrape(doc) {
+
+ // init variables
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == "x" ) return namespace; else return null;
+ } : null;
+ var myXPath;
+ var myXPathObject;
+
+ // basic item details
+ var newItem = new Zotero.Item('newspaperArticle');
+ newItem.url = doc.location.href;
+ newItem.archive = 'Papers Past';
+
+ // publication title
+ myXPath = '//meta[@name="newsarticle_publication"]/@content';
+ myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ newItem.publicationTitle = myXPathObject.iterateNext().textContent;
+ Zotero.debug(newItem.publicationTitle);
+
+ // article title (convert to sentence case)
+ // NB: THE CONVERSION SEEMS TO FAIL IF HAS SPECIAL CHARS
+ myXPath = '//meta[@name="newsarticle_headline"]/@content';
+ myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var title = myXPathObject.iterateNext().textContent;
+ var words = title.split(/\s/);
+ var titleFixed = '';
+ for (var i in words) {
+ words[i] = words[i][0].toUpperCase() + words[i].substr(1).toLowerCase();
+ titleFixed = titleFixed + words[i] + ' ';
+ }
+ titleFixed = Zotero.Utilities.trim(titleFixed);
+ newItem.title = titleFixed;
+
+ // publication date (is preformatted to ISO 8601)
+ myXPath = '//meta[@name="dc_date"]/@content';
+ myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ newItem.date = myXPathObject.iterateNext().textContent;
+
+ // pagination
+ myXPath = '//meta[@name="newsarticle_firstpage"]/@content';
+ myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var pages = myXPathObject.iterateNext().textContent;
+
+ myXPath = '//meta[@name="newsarticle_otherpages"]/@content';
+ myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ pages = pages + ' ' + myXPathObject.iterateNext().textContent;
+
+ newItem.pages = Zotero.Utilities.trim(pages);
+
+ // save copy of entire web page as attachment
+ var attachments = new Array();
+ attachments.push({
+ title:titleFixed + " : Article webpage",
+ mimeType:"text/html",
+ url:doc.location.href
+ });
+
+ // find image scans and add as attachments
+ myXPath = '//img[@class="veridianimage"]/@src';
+ myXPathObject = doc.evaluate(myXPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var imgSrc;
+ var imgUrl;
+ var imgNo = 0;
+ while (imgSrc = myXPathObject.iterateNext() ) {
+ imgUrl = "http://paperspast.natlib.govt.nz" + imgSrc.textContent;
+ attachments.push({
+ title: titleFixed + " : Scan image part " + ++imgNo,
+ mimeType: "image/gif",
+ url: imgUrl
+ });
+ }
+ newItem.attachments = attachments;
+
+ // finish
+ newItem.complete();
+}