Trans: Updated Globe translator, by Frank Bennett. - www - Unnamed repository; edit this file 'description' to name the repository.

commit 107632d97048314fb495dcd339bdf393acec95c7
parent fd209f98dc8a09444ee913e58788f29ad7a89fea
Author: Avram Lyon <ajlyon@gmail.com>
Date:   Fri,  6 May 2011 17:04:42 +0000

Trans: Updated Globe translator, by Frank Bennett. 


Diffstat:
M translators/The Boston Globe.js  | 275 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------

1 file changed, 176 insertions(+), 99 deletions(-)
diff --git a/translators/The Boston Globe.js b/translators/The Boston Globe.js
@@ -1,110 +1,189 @@
 {
-	"translatorID":"1f245496-4c1b-406a-8641-d286b3888231",
-	"translatorType":4,
-	"label":"The Boston Globe",
-	"creator":"Adam Crymble",
-	"target":"http://(www|search).boston.com/",
-	"minVersion":"1.0.0b4.r5",
-	"maxVersion":"",
-	"priority":100,
-	"inRepository":true,
-	"lastUpdated":"2008-06-06 08:45:00"
+        "translatorID": "1f245496-4c1b-406a-8641-d286b3888231",
+        "label": "The Boston Globe",
+        "creator": "Adam Crymble and Frank Bennett",
+        "target": "^http://(www|search|articles)\\.boston\\.com/",
+        "minVersion": "1.0.0b4.r5",
+        "maxVersion": "",
+        "priority": 100,
+        "inRepository": false,
+        "translatorType": 4,
+        "lastUpdated": "2011-05-06 20:57:16"
 }
 
+/*
+ * Sample URLs
+ *
+ * [Original request -- uncommon page format, no embedded metadata of any kind]
+ * http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant
+ *
+ * [More common page formats, marginally reliable metadata in a comment block]
+ * http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html
+ * http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/
+ * http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/
+
+ * Support for search results will require rewriting scrape(..) to use only regular expressions
+ */
+
 function detectWeb(doc, url) {
+	var namespace = doc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+	}: null;
+	
 	if (url.match("search.boston.com")) {
-		return "multiple";
-	} else if (doc.evaluate('//div[@id="headTools"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
+		// Search disabled until cross-domain can be dealt with
+		return false;
+		var results =  doc.evaluate('//div[@class="resultsMain"]//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
+		if (results.iterateNext()) {
+			return "multiple";
+		} else {
+			return false;
+		}
+	} else if (url.match(/(\/[0-9]{4}\/[0-9]{2}\/|[0-9]{4}-[0-9]{2}-[0-9]{2})/)) {
 		return "newspaperArticle";
-	} else if (doc.evaluate('//div[@id="blogEntry"]/h1/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
-		return "blogPost";
 	} 
 }
 
-//Boston Globe and Boston.com Translator. Code by Adam Crymble
+//Boston Globe and Boston.com Translator. Original code by Adam Crymble
+// Rewritten by Frank Bennett, 2011
 
-function scrape (doc, url) {
-	var namespace = doc.documentElement.namespaceURI;
-	var nsResolver = namespace ? function(prefix) {
-	}: null;
-		
-	//sets variables that remain constant in both formats
-					
-		if (doc.evaluate('//span[@id="dateline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext())  {
-			var xPathDateResults = doc.evaluate ('//span[@id="dateline"]', doc, nsResolver, XPathResult.ANY_TYPE, null);	
+function sniffComment (elem) {
+	if (!elem) {
+		return elem;
+	}
+	for (var i = 0, ilen = elem.childNodes.length; i < ilen; i += 1) {
+		if (elem.childNodes[i].nodeName === "#comment") {
+			return elem.childNodes[i].nodeValue;
 		}
-		
-		if (doc.evaluate('//span[@id="byline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext())  {
-			var xPathAuthorResults= doc.evaluate ('//span[@id="byline"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
-		}	
-		
-	
-	//sets variables unique to the blog posts on Boston.com		
-	
-		if (doc.evaluate('//div[@id="blogEntry"]/h1/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) {
-			
-			var newItem =new Zotero.Item("blogPost");
-			newItem.publicationTitle = "Boston.com";
-			
-			//title
-			var xPathTitle = '//div[@id="blogEntry"]/h1/a';
-			
-			//date
-			var articleDate = xPathDateResults.iterateNext().textContent;
-			newItem.date = articleDate;
-			
-			//author
-			var articleAuthor = xPathAuthorResults.iterateNext().textContent.replace(/Posted by /i, '');
-			articleAuthor = articleAuthor.split(',');
-			var authorName = articleAuthor[0].split("and ");
-	
-	//else it sets the variables unique to the articles on the Boston Globe	
-	
-		} else if (doc.evaluate('//div[@id="headTools"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext())  {
-			
-			var newItem = new Zotero.Item("newspaperArticle");
-			newItem.publicationTitle = "The Boston Globe";
-		
-			//title
-			var xPathTitle = '//div[@id="headTools"]/h1';
-			
-			//date
-			if (doc.evaluate('//span[@id="dateline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext())  {
-				var articleDate = xPathDateResults.iterateNext().textContent;
-				if (articleDate.match('/')) {
-					articleDate = articleDate.split('/');
-				newItem.date = articleDate[1];	
+	}
+	return false;
+}
+
+function findMagicComment (doc) {
+	var hideMeElems = doc.getElementsByClassName("hideMe");
+	for (var i = 0, ilen = hideMeElems.length; i < ilen; i += 1) {
+		var elem = hideMeElems.item(i);
+		var sniff = sniffComment(elem);
+		if (sniff) {
+			return sniff;
+		}
+	}
+	var contentElem = doc.getElementById("content");
+	return sniffComment(contentElem);
+}
+
+function findAuthorString (doc, newItem) {
+	var authors = "";
+	var bylineElem = false;
+	var bylineElems = doc.getElementsByClassName("byline");
+	if (bylineElems.length) {
+		bylineElem = bylineElems.item(0);
+	}
+	if (!bylineElem) {
+		var bylineElem = doc.getElementById('byline');
+	}
+	if (bylineElem) {
+		authors = bylineElem.textContent;
+		authors = authors.replace("\n", " ", "g");
+		if (authors.match(/[Pp]osted\s+by\s+/)) {
+			newItem.itemType = "blogPost";
+		}
+		authors = authors.replace(/^\s*(?:[Bb]y|[Pp]osted\s+by)\s+(.*)/, "$1");
+	}
+	return authors;
+}
+
+function scrape (doc, url) {
+	// The site content is pretty chaotic, we do our best.
+
+	// There are two independent blocks set-and-save blocks
+	// below.
+
+	// Many pages seem to have metadata embedded in a comment
+	// The date and headline info look reliable, but
+	// the byline is a disaster, to be used only
+	// if absolutely necessary.
+	var magicComment = findMagicComment(doc);
+	if (magicComment) {
+		// Blind acceptance
+		var newItem =new Zotero.Item("newspaperArticle");
+		newItem.publicationTitle = "Boston.com";
+		// URL	
+		newItem.url = doc.location.href;
+		// Attachment
+		newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});
+		// Now try to get some citation details (go ahead, try)
+		var info = magicComment.replace('\n','','g');
+		newItem.title = Zotero.Utilities.unescapeHTML(info.replace(/.*<headline>(.*)<\/headline>.*/,"$1"));
+		newItem.date = info.replace(/.*<date>(.*)<\/date>.*/,"$1");
+		var authors = findAuthorString(doc, newItem);
+		if (!authors) {
+			var authors = info.replace(/.*<byline>(.*)<\/byline>.*/,"$1");
+			if (authors.toLowerCase() === authors) {
+				authors = info.replace(/.*<teasetext>(.*)<\/teasetext>.*/, "$1");
+				var m = authors.match(/^(?:[Bb]y\s+)*([^ ,]+).*/);
+				if (m) {
+					authors = m[1];
 				} else {
-					newItem.date = articleDate;
+					authors = "";
 				}
-				
-			}			
-			
-			//author(s)
-				var articleAuthor = xPathAuthorResults.iterateNext().textContent.replace(/^\s*|\s*$/g, '');
-				articleAuthor= articleAuthor.substr(3);
-				var authorName = articleAuthor.split("and ");
-			
-			
-			//byline	
-			if (doc.evaluate('//div[@id="headTools"]/h2', doc, null, XPathResult.ANY_TYPE, null).iterateNext())  {		
-				newItem.abstractNote = doc.evaluate ('//div[@id="headTools"]/h2', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
 			}
 		}
-			
-		//creates title using xPaths defined above
-			var xPathTitleResults = doc.evaluate (xPathTitle, doc, nsResolver, XPathResult.ANY_TYPE, null);
-			newItem.title = xPathTitleResults.iterateNext().textContent;
-		
-		//pushes author(s)	
-			
-			for (var i=0; i<authorName.length; i++) {
-				newItem.creators.push(Zotero.Utilities.cleanAuthor(authorName[i], "author"));
-			}	
-		
+		authors = authors.split(/,*\s+and\s+/);
+		authors[authors.length - 1] = authors[authors.length - 1].split(/,\s+/)[0];
+		authors = authors.join(", ");
+		authors = authors.split(/,\s+/);
+		for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
+			var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
+			if (author.lastName) {
+				newItem.creators.push(author);
+			}
+		}
+		newItem.complete();
+	}
+
+
+	// Information block
+	var infoElem = doc.getElementById("mod-article-byline");
+	if (infoElem) {
+		var newItem = new Zotero.Item("newspaperArticle");
+		newItem.publicationTitle = "Boston.com";
+		// URL	
 		newItem.url = doc.location.href;
-			
+		newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"});
+
+		// Date
+		var dateElem = infoElem.getElementsByClassName('pubdate');
+		if (dateElem.length) {
+			newItem.date = dateElem.textContent;
+		}
+
+		// Authors
+		for (var i = 0, ilen = infoElem.childNodes.length; i < ilen; i += 1) {
+			var node = infoElem.childNodes.item(i);
+			if (node.nodeName === 'SPAN') {
+				if ('By' === node.textContent.slice(0,2)) {
+					
+					var authors = node.textContent.slice(3);
+					authors = authors.split(/(?:, |,*\s+and\s+)/);
+					for (var j = 0, jlen = authors.length; j < jlen; j += 1) {
+						var author = Zotero.Utilities.cleanAuthor(authors[j], 'author');
+						newItem.creators.push(author);
+					}
+				}
+			}
+		}
+		
+		// Title	
+		var headerElem = doc.getElementById('mod-article-header');
+		if (headerElem) {
+			var h = headerElem.getElementsByTagName('h1');
+			if (h.length) {
+				newItem.title = h[0].textContent;
+			}
+		}
 		newItem.complete();
+	}
 }
 
 
@@ -116,10 +195,9 @@ function doWeb (doc, url) {
 	var uris= new Array();
 
 	if (detectWeb(doc, url) == "multiple") {
-		var items = new Object();
+		var items = {};
 		var result =  doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null);
 		var elmt = result.iterateNext();
-		Zotero.debug(elmt);
 		while (elmt) {
 			//items.push(elmt.href);
 			items[elmt.href] = elmt.textContent;
@@ -135,9 +213,9 @@ function doWeb (doc, url) {
 		for (var i in items) {
 			uris.push(i);
 		}
-	} else
-		uris.push(url);
-		Zotero.debug(uris);
-	Zotero.Utilities.processDocuments(uris, scrape, function() {Zotero.done();});
-	Zotero.wait();
-}
-\ No newline at end of file
+		Zotero.Utilities.processDocuments(uris, scrape, Zotero.done);
+		Zotero.wait();
+	} else {
+		scrape(doc, url);
+	}
+}

	www Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| Submodules \| README \| LICENSE