www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 107632d97048314fb495dcd339bdf393acec95c7
parent fd209f98dc8a09444ee913e58788f29ad7a89fea
Author: Avram Lyon <ajlyon@gmail.com>
Date:   Fri,  6 May 2011 17:04:42 +0000

Trans: Updated Globe translator, by Frank Bennett. 


Diffstat:
Mtranslators/The Boston Globe.js | 275++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
1 file changed, 176 insertions(+), 99 deletions(-)

diff --git a/translators/The Boston Globe.js b/translators/The Boston Globe.js @@ -1,110 +1,189 @@ { - "translatorID":"1f245496-4c1b-406a-8641-d286b3888231", - "translatorType":4, - "label":"The Boston Globe", - "creator":"Adam Crymble", - "target":"http://(www|search).boston.com/", - "minVersion":"1.0.0b4.r5", - "maxVersion":"", - "priority":100, - "inRepository":true, - "lastUpdated":"2008-06-06 08:45:00" + "translatorID": "1f245496-4c1b-406a-8641-d286b3888231", + "label": "The Boston Globe", + "creator": "Adam Crymble and Frank Bennett", + "target": "^http://(www|search|articles)\\.boston\\.com/", + "minVersion": "1.0.0b4.r5", + "maxVersion": "", + "priority": 100, + "inRepository": false, + "translatorType": 4, + "lastUpdated": "2011-05-06 20:57:16" } +/* + * Sample URLs + * + * [Original request -- uncommon page format, no embedded metadata of any kind] + * http://articles.boston.com/2011-05-03/news/29500032_1_bouncer-assault-local-restaurant + * + * [More common page formats, marginally reliable metadata in a comment block] + * http://www.boston.com/yourtown/news/charlestown/2011/04/meet_charlestowns_youth_of_the.html + * http://www.boston.com/business/articles/2011/05/05/oil_drops_below_100_per_barrel/ + * http://www.boston.com/lifestyle/articles/2011/04/28/anticipation_grows_for_mfas_art_in_bloom_festival/ + + * Support for search results will require rewriting scrape(..) to use only regular expressions + */ + function detectWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + }: null; + if (url.match("search.boston.com")) { - return "multiple"; - } else if (doc.evaluate('//div[@id="headTools"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { + // Search disabled until cross-domain can be dealt with + return false; + var results = doc.evaluate('//div[@class="resultsMain"]//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null); + if (results.iterateNext()) { + return "multiple"; + } else { + return false; + } + } else if (url.match(/(\/[0-9]{4}\/[0-9]{2}\/|[0-9]{4}-[0-9]{2}-[0-9]{2})/)) { return "newspaperArticle"; - } else if (doc.evaluate('//div[@id="blogEntry"]/h1/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - return "blogPost"; } } -//Boston Globe and Boston.com Translator. Code by Adam Crymble +//Boston Globe and Boston.com Translator. Original code by Adam Crymble +// Rewritten by Frank Bennett, 2011 -function scrape (doc, url) { - var namespace = doc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - }: null; - - //sets variables that remain constant in both formats - - if (doc.evaluate('//span[@id="dateline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - var xPathDateResults = doc.evaluate ('//span[@id="dateline"]', doc, nsResolver, XPathResult.ANY_TYPE, null); +function sniffComment (elem) { + if (!elem) { + return elem; + } + for (var i = 0, ilen = elem.childNodes.length; i < ilen; i += 1) { + if (elem.childNodes[i].nodeName === "#comment") { + return elem.childNodes[i].nodeValue; } - - if (doc.evaluate('//span[@id="byline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - var xPathAuthorResults= doc.evaluate ('//span[@id="byline"]', doc, nsResolver, XPathResult.ANY_TYPE, null); - } - - - //sets variables unique to the blog posts on Boston.com - - if (doc.evaluate('//div[@id="blogEntry"]/h1/a', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - - var newItem =new Zotero.Item("blogPost"); - newItem.publicationTitle = "Boston.com"; - - //title - var xPathTitle = '//div[@id="blogEntry"]/h1/a'; - - //date - var articleDate = xPathDateResults.iterateNext().textContent; - newItem.date = articleDate; - - //author - var articleAuthor = xPathAuthorResults.iterateNext().textContent.replace(/Posted by /i, ''); - articleAuthor = articleAuthor.split(','); - var authorName = articleAuthor[0].split("and "); - - //else it sets the variables unique to the articles on the Boston Globe - - } else if (doc.evaluate('//div[@id="headTools"]/h1', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - - var newItem = new Zotero.Item("newspaperArticle"); - newItem.publicationTitle = "The Boston Globe"; - - //title - var xPathTitle = '//div[@id="headTools"]/h1'; - - //date - if (doc.evaluate('//span[@id="dateline"]', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - var articleDate = xPathDateResults.iterateNext().textContent; - if (articleDate.match('/')) { - articleDate = articleDate.split('/'); - newItem.date = articleDate[1]; + } + return false; +} + +function findMagicComment (doc) { + var hideMeElems = doc.getElementsByClassName("hideMe"); + for (var i = 0, ilen = hideMeElems.length; i < ilen; i += 1) { + var elem = hideMeElems.item(i); + var sniff = sniffComment(elem); + if (sniff) { + return sniff; + } + } + var contentElem = doc.getElementById("content"); + return sniffComment(contentElem); +} + +function findAuthorString (doc, newItem) { + var authors = ""; + var bylineElem = false; + var bylineElems = doc.getElementsByClassName("byline"); + if (bylineElems.length) { + bylineElem = bylineElems.item(0); + } + if (!bylineElem) { + var bylineElem = doc.getElementById('byline'); + } + if (bylineElem) { + authors = bylineElem.textContent; + authors = authors.replace("\n", " ", "g"); + if (authors.match(/[Pp]osted\s+by\s+/)) { + newItem.itemType = "blogPost"; + } + authors = authors.replace(/^\s*(?:[Bb]y|[Pp]osted\s+by)\s+(.*)/, "$1"); + } + return authors; +} + +function scrape (doc, url) { + // The site content is pretty chaotic, we do our best. + + // There are two independent blocks set-and-save blocks + // below. + + // Many pages seem to have metadata embedded in a comment + // The date and headline info look reliable, but + // the byline is a disaster, to be used only + // if absolutely necessary. + var magicComment = findMagicComment(doc); + if (magicComment) { + // Blind acceptance + var newItem =new Zotero.Item("newspaperArticle"); + newItem.publicationTitle = "Boston.com"; + // URL + newItem.url = doc.location.href; + // Attachment + newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"}); + // Now try to get some citation details (go ahead, try) + var info = magicComment.replace('\n','','g'); + newItem.title = Zotero.Utilities.unescapeHTML(info.replace(/.*<headline>(.*)<\/headline>.*/,"$1")); + newItem.date = info.replace(/.*<date>(.*)<\/date>.*/,"$1"); + var authors = findAuthorString(doc, newItem); + if (!authors) { + var authors = info.replace(/.*<byline>(.*)<\/byline>.*/,"$1"); + if (authors.toLowerCase() === authors) { + authors = info.replace(/.*<teasetext>(.*)<\/teasetext>.*/, "$1"); + var m = authors.match(/^(?:[Bb]y\s+)*([^ ,]+).*/); + if (m) { + authors = m[1]; } else { - newItem.date = articleDate; + authors = ""; } - - } - - //author(s) - var articleAuthor = xPathAuthorResults.iterateNext().textContent.replace(/^\s*|\s*$/g, ''); - articleAuthor= articleAuthor.substr(3); - var authorName = articleAuthor.split("and "); - - - //byline - if (doc.evaluate('//div[@id="headTools"]/h2', doc, null, XPathResult.ANY_TYPE, null).iterateNext()) { - newItem.abstractNote = doc.evaluate ('//div[@id="headTools"]/h2', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent; } } - - //creates title using xPaths defined above - var xPathTitleResults = doc.evaluate (xPathTitle, doc, nsResolver, XPathResult.ANY_TYPE, null); - newItem.title = xPathTitleResults.iterateNext().textContent; - - //pushes author(s) - - for (var i=0; i<authorName.length; i++) { - newItem.creators.push(Zotero.Utilities.cleanAuthor(authorName[i], "author")); - } - + authors = authors.split(/,*\s+and\s+/); + authors[authors.length - 1] = authors[authors.length - 1].split(/,\s+/)[0]; + authors = authors.join(", "); + authors = authors.split(/,\s+/); + for (var j = 0, jlen = authors.length; j < jlen; j += 1) { + var author = Zotero.Utilities.cleanAuthor(authors[j], 'author'); + if (author.lastName) { + newItem.creators.push(author); + } + } + newItem.complete(); + } + + + // Information block + var infoElem = doc.getElementById("mod-article-byline"); + if (infoElem) { + var newItem = new Zotero.Item("newspaperArticle"); + newItem.publicationTitle = "Boston.com"; + // URL newItem.url = doc.location.href; - + newItem.attachments.push({url:doc.location.href,mimetype:"text/html",snapshot:true,title:"Boston.com page"}); + + // Date + var dateElem = infoElem.getElementsByClassName('pubdate'); + if (dateElem.length) { + newItem.date = dateElem.textContent; + } + + // Authors + for (var i = 0, ilen = infoElem.childNodes.length; i < ilen; i += 1) { + var node = infoElem.childNodes.item(i); + if (node.nodeName === 'SPAN') { + if ('By' === node.textContent.slice(0,2)) { + + var authors = node.textContent.slice(3); + authors = authors.split(/(?:, |,*\s+and\s+)/); + for (var j = 0, jlen = authors.length; j < jlen; j += 1) { + var author = Zotero.Utilities.cleanAuthor(authors[j], 'author'); + newItem.creators.push(author); + } + } + } + } + + // Title + var headerElem = doc.getElementById('mod-article-header'); + if (headerElem) { + var h = headerElem.getElementsByTagName('h1'); + if (h.length) { + newItem.title = h[0].textContent; + } + } newItem.complete(); + } } @@ -116,10 +195,9 @@ function doWeb (doc, url) { var uris= new Array(); if (detectWeb(doc, url) == "multiple") { - var items = new Object(); + var items = {}; var result = doc.evaluate('//div[@class="regTZ"]/a[@class="titleLink"]', doc, nsResolver, XPathResult.ANY_TYPE, null); var elmt = result.iterateNext(); - Zotero.debug(elmt); while (elmt) { //items.push(elmt.href); items[elmt.href] = elmt.textContent; @@ -135,9 +213,9 @@ function doWeb (doc, url) { for (var i in items) { uris.push(i); } - } else - uris.push(url); - Zotero.debug(uris); - Zotero.Utilities.processDocuments(uris, scrape, function() {Zotero.done();}); - Zotero.wait(); -} -\ No newline at end of file + Zotero.Utilities.processDocuments(uris, scrape, Zotero.done); + Zotero.wait(); + } else { + scrape(doc, url); + } +}