www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 903051e471c57edaa7afb8b44923415c10afa52d
parent c373786a1ef95a41ed1076afce88659a953f9109
Author: Avram Lyon <ajlyon@gmail.com>
Date:   Sun, 22 Aug 2010 22:40:16 +0000

Rewritten Stuff.co.nz translator by Sopheak Hean.


Diffstat:
Mtranslators/Stuff.co.nz.js | 519+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 440 insertions(+), 79 deletions(-)

diff --git a/translators/Stuff.co.nz.js b/translators/Stuff.co.nz.js @@ -1,107 +1,469 @@ { - "translatorID":"631ff0c7-2e64-4279-a9c9-ad9518d40f2b", - "translatorType":4, - "label":"Stuff.co.nz", - "creator":"Michael Berkowitz", - "target":"^http://(www.)?stuff.co.nz/", - "minVersion":"1.0.0b4.r5", - "maxVersion":"", - "priority":100, - "inRepository":true, - "lastUpdated":"2007-08-14 22:15:00" + "translatorID":"386c7e75-eef4-47b1-b5a6-0faa3cfa4f44", + "label":"Stuff.co.nz", + "creator":"Sopheak Hean (University of Waikato, Faculty of Education)", + "target":"^http://(www\\.)?stuff\\.co\\.nz/", + "minVersion":"1.0", + "maxVersion":"", + "priority":100, + "inRepository":"1", + "translatorType":4, + "lastUpdated":"2010-08-23 00:34:34" } +/* + Stuff.co.nz Translator- Parses Stuff.co.nz articles and creates Zotero-based metadata + Copyright (C) 2010 Sopheak Hean, University of Waikato, Faculty of Education + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <http://www.gnu.org/licenses/>. +*/ + +/* Stuff.co.nz does not have an ISSN because it is not a newspaper publisher. Stuff.co.nz is a collection of newspaper articles from around the country*/ + function detectWeb(doc, url) { - if ((doc.location.href.indexOf("search-results") != -1) || (doc.location.href.indexOf("/blogs/blogs/") != -1 )) { - return "multiple"; - } else if ((doc.location.href.indexOf("blogs") != -1) && (url != "http://www.stuff.co.nz/blogs/blogs") && (url != "http://stuff.co.nz/blogs/blogs")) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == "x" ) return namespace; else return null; + } : null; + var definePath = '//div[@class="blog_content"]'; + var XpathObject = doc.evaluate(definePath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); +if (XpathObject){ return "blogPost"; - } else if (doc.location.href.indexOf("html") == (doc.location.href.length - 4)){ + } + + else { + var definePath = '//div[@class="story_landing"]'; + var XpathObject = doc.evaluate(definePath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (XpathObject){ return "newspaperArticle"; + } } + +} + +function myUpperCaseFunction(input){ + /*Will define one later*/ } + function scrape(doc, url) { - if (doc.location.href.indexOf("html") != -1) { - var newItem = new Zotero.Item("newspaperArticle"); + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + var url = doc.location.href; + var splitIntoArray; + var fullName=""; + var emptyString =" "; + var firstName; var lastName; + /*==========================Blog Post===========================*/ + + if (detectWeb(doc, url) =="blogPost"){ + + var newItem = new Zotero.Item('blogPost'); newItem.url = doc.location.href; + //newItem.title = "No Title Found"; newItem.publicationTitle = "Stuff.co.nz"; - newItem.title = doc.title.split(" - ")[0]; + newItem.language = "English"; + + //Get Author + try { /*Try and Catch if encounter erro */ - //abstract - var xpath = '//div[@id="leftcol_story"]/p/strong'; - newItem.abstractNote = Zotero.Utilities.cleanString(doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent); + var blogAuthor = "//div[@id='left_col']/span"; + var blogAuthorObject = doc.evaluate(blogAuthor, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (blogAuthorObject) { + + if (blogAuthorObject.textContent.replace(/\s*/g,'') ==""){ + newItem.creators =blogAuthorObject.textContent.replace(/\s*/g,''); + } + + else{ + blogAuthorObject = blogAuthorObject.textContent; + if(blogAuthorObject.match(/[\s\n\r\t]+-[\s\n\r\t]+[a-zA-Z\s\n\r\t]*/g)){ + blogAuthorObject = blogAuthorObject.replace(/([\s\n\r\t]+-[\s\n\r\t]+[a-zA-Z\s\n\r\t]*)/g, '').replace(/\bBy \b/g,''); + splitIntoArray = blogAuthorObject.split (" "); + for (var i = 0; i < splitIntoArray.length; i++){ + firstName = splitIntoArray[i].substring(0,1).toUpperCase(); + lastName = splitIntoArray[i].substring(1).toLowerCase(); + fullName += firstName + lastName + emptyString; + + } + newItem.creators.push(Zotero.Utilities.cleanAuthor(fullName , "author")); + } + + else { + splitIntoArray = blogAuthorObject.replace(/\bBy \b/g,'').split (" "); + for (var i = 0; i < splitIntoArray.length; i++){ + firstName = splitIntoArray[i].substring(0,1).toUpperCase(); + lastName = splitIntoArray[i].substring(1).toLowerCase(); + fullName += firstName + lastName + emptyString; + + } + newItem.creators.push(Zotero.Utilities.cleanAuthor(fullName , "author")); } + } + } + } catch (err) { + newItem.creators ="error"; + + } + + //Title of the Article + var getBlogTitle = "//span[@class='hbox_top_title headlines_title']/a"; + var getBlogTitleObject = doc.evaluate(getBlogTitle, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (getBlogTitleObject){ + newItem.blogTitle =getBlogTitleObject.textContent.replace(/\s+\bHeadlines\b/g, ''); + } + newItem.shortTitle = doShortTitle(doc,url); + newItem.title= doTitle(doc, url); + newItem.date = doDate(doc, url); + newItem.abstractNote = doAbstract(doc, url); + newItem.websiteType = "Newspaper"; + newItem.attachments.push({url:url, title:"Stuff.co.nz Snapshot", mimeType:"text/html"}); + newItem.complete(); + } + + + + /* ======================Newspaper Article========================*/ + + else if (detectWeb(doc, url) =="newspaperArticle"){ + + var newItem = new Zotero.Item('newspaperArticle'); + newItem.url = doc.location.href; + //newItem.title = "No Title Found"; + + //Get extended publisher if there is any then replace with stuff.co.nz + var myPublisher = '//span[@class="storycredit"]'; + + var myPublisherObject = doc.evaluate(myPublisher , doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (myPublisherObject) { + var realPublisher = myPublisherObject.textContent; + if (realPublisher.match(/\bBy[\s\n\r\t]+[a-zA-Z\s\r\t\n]*-[\s\n\r\t]*/g)){ + realPublisher = realPublisher.replace (/\bBy[\s\n\r\t]+[a-zA-Z\s\r\t\n]*-[\s\n\r\t]*/g, '').replace(/^\s*|\s*$/g, ''); + newItem.publicationTitle = realPublisher; + } else { + newItem.publicationTitle = "Stuff.co.nz"; + } + + } else { + newItem.publicationTitle = "Stuff.co.nz"; + } + + newItem.language = "English"; - //date and author - var xpath = '//div[@id="story_headline"]'; - var info = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(/\n+/)[2].split(" | "); + //Short Title + newItem.shortTitle = doShortTitle(doc,url); + - newItem.date = Zotero.Utilities.cleanString(info[1].split(",")[1]); + //get Abstract + newItem.abstractNote = doAbstract(doc, url); + var authorXPath = '//span[@class="storycredit"]'; - var author = Zotero.Utilities.cleanString(info[0]); - if (author.substr(0,2).toLowerCase() == "by") { - author = author.substr(3); - if (author.indexOf(" - ") != -1) { - author = author.split(" - ")[0].split(" "); - } else { - author = author.split(" "); + var authorXPathObject = doc.evaluate(authorXPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (authorXPathObject){ + var authorArray = new Array("NZPA", "The Press", "The Dominion Post"); + authorXPathObject = authorXPathObject.textContent; + + if(authorXPathObject.match(/[\s\n\r\t]+-[\s\n\r\t]+\b[a-zA-Z\s\n\r\t]*|^\s+\bBy\s*/g)){ + authorXPathObject = authorXPathObject.replace(/([\s\n\r\t]+-[\s\n\r\t]+\b[a-zA-Z\s\n\r\t]*)|\b.co.nz|\b.com|(-[a-zA-Z0-9]*)/g, ''); + var authorString = authorXPathObject.replace(/^\s+\bBy\s*|^\s+\bBY\s*/g, ''); + + if (authorString.match(/\W\band\W+/g)){ + authorTemp = authorString.replace(/\W\band\W+/g, ', '); + authorArray = authorTemp.split(", "); + + } else if (!authorString.match(/\W\band\W+/g)) + { + authorArray = authorString.toLowerCase(); + } + if( authorArray instanceof Array ) { + for (var i in authorArray){ + splitIntoArray = authorArray[i].split (" "); + for (var i = 0; i < splitIntoArray.length; i++){ + firstName = splitIntoArray[i].substring(0,1).toUpperCase(); + lastName = splitIntoArray[i].substring(1).toLowerCase(); + fullName += firstChar + lastChar + emptyString; + + + } + newItem.creators.push(Zotero.Utilities.cleanAuthor(JoinString, "author")); + + } + + } else { + + + if (authorString.match(/\W\bof\W+/g)){ + authorTemp = authorString.replace (/\W\bof\W(.*)/g, ''); + splitIntoArray = authorTemp.split (" "); + for (var i = 0; i < splitIntoArray.length; i++){ + firstName = splitIntoArray[i].substring(0,1).toUpperCase(); + lastName = splitIntoArray[i].substring(1).toLowerCase(); + fullName += firstChar + lastChar + emptyString; + + } + newItem.creators.push(Zotero.Utilities.cleanAuthor(JoinString, "author")); + + + } else { + + splitIntoArray = authorArray.split (" "); + for (var i = 0; i < splitIntoArray.length; i++){ + firstName = splitIntoArray[i].substring(0,1).toUpperCase(); + lastName = splitIntoArray[i].substring(1).toLowerCase(); + fullName += firstName+ lastName + emptyString; + + + } + newItem.creators.push(Zotero.Utilities.cleanAuthor(fullName, "author")); + } + + } + } else { + + if(authorXPathObject.match(/[\s\n\r]+/g)){ + + authorXPathObject = authorXPathObject.replace(/^\s*|\s*$/g, '').replace(/\s+/g, '-'); + newItem.creators.push(Zotero.Utilities.cleanAuthor(authorXPathObject, "author")); + } + else { newItem.creators.push(Zotero.Utilities.cleanAuthor(authorXPathObject , "author"));} + } - for (var i = 0 ; i < author.length ; i++) { - author[i] = author[i][0] + author[i].substr(1).toLowerCase(); - var creator = author.join(" "); + + } else{ + newItem.creators =""; + } + + //Title of the Article + newItem.title= doTitle(doc, url); + + + //Section of the Article + + var current = '//li/a[@class="current"]'; + var currentObject = doc.evaluate(current, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (currentObject){ + currentObject = currentObject.textContent; + + var articleSection = '//li[@class="mid_nav_item"]/a'; + var articleSectionObject = doc.evaluate(articleSection , doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (articleSectionObject){ + articleSectionObject = articleSectionObject .textContent; + switch (articleSectionObject){ + case "National": + case "Business": + case "Sport": + case "Politics": + newItem.place= "New Zealand"; + newItem.section = currentObject; + break; + + case "World": + newItem.place= "World"; + newItem.section = currentObject; break; + + default: + newItem.section = articleSectionObject;break; + } + } + var SectionType = '//li[@class="current_nav_item"]/a'; + var SectionTypeObject = doc.evaluate(SectionType, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (SectionType){ + + SectionTypeObject = SectionTypeObject.textContent; + switch (SectionTypeObject) { + case "National": + case "Crime": + case "Education": + case "Health": + case "Politics": + case "Environment": + case "Business": + + newItem.place= "New Zealand"; + newItem.section = currentObject; break; + + case "Opinion": + case "Rugby": + case "Soccer": + case "Cricket": + case "Basketball": + case "Fishing": + case "League": + case "Scoreboard": + case "Football": + case "Golf": + case "Motorsport": + case "Netball": + case "Tennis": + + newItem.section ="Sport"; break; + default: + newItem.section = SectionTypeObject; break; + } + } + } + else { + var SectionType = '//li[@class="current_nav_item"]/a'; + var SectionTypeObject = doc.evaluate(SectionType, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (SectionType){ + + SectionTypeObject = SectionTypeObject.textContent; + + switch (SectionTypeObject) { + case "National": + case "Crime": + case "Education": + case "Health": + case "Politics": + case "Environment": + case "Business": + newItem.place= "New Zealand"; + newItem.section = SectionTypeObject; break; + + default: + newItem.section =SectionTypeObject; break; + } + } - newItem.creators.push(Zotero.Utilities.cleanAuthor(creator, "author")); - } else { - newItem.extra = author; } - } else if (doc.location.href.indexOf("blogs") != -1) { - var newItem = new Zotero.Item("blogPost"); - newItem.url = doc.location.href; + //Snapshot of the web page. + newItem.attachments.push({url:url, title:"Stuff.co.nz Snapshot", + mimeType:"text/html"}); + + //Call Do date function to make it cleaner in scape. This way things are easier to follow. + newItem.date = doDate(doc,url); + newItem.complete(); + + } + +} + - //post title - var xpath = '//div[@class="post"]/h2[@class="storytitle"]/a'; - newItem.title = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent; - - //date and author - var xpath = '//div[@class="meta"][@id="postdate"]' - var info = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext().textContent.split(" | "); - var byline = Zotero.Utilities.cleanString(info[0]).split(" in "); - newItem.creators.push(Zotero.Utilities.cleanAuthor(byline[0], "author")); - newItem.blogTitle = byline[1]; - var date = Zotero.Utilities.cleanString(info[1]).split("m "); - newItem.date = date[1]; +function doShortTitle(doc, url){ + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + var shortTitle=""; + var subTitle = '//div[@id="left_col"]/h2'; + var subTitleObject = doc.evaluate(subTitle, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (subTitleObject){ + shortTitle= subTitleObject.textContent.replace(/^\s*|\s*$/g, ''); + return shortTitle; + } else { + return shortTitle; } - newItem.complete(); + } -function doWeb(doc, url) { - var URLS = new Array(); - - //multiple - if ((url.indexOf("search-results") != -1) || (url.indexOf("blogs/blogs/") != -1)) { - if (url.indexOf("search-results") != -1) { - var xpath = '//div[@id="leftcol_story"]/p/a'; - } else if (url.indexOf("blogs/blogs/") != -1) { - var xpath = '//h2[@class="storytitle"]/a'; - } +function doAbstract(doc, url){ - var items = new Object(); - var titles = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null); - var newTitle = titles.iterateNext(); - while (newTitle) { - items[newTitle.href] = newTitle.textContent; - newTitle = titles.iterateNext(); - } + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + var abstractString=""; + var a= "//meta[@name='description']"; + var abs= doc.evaluate(a, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (abs){ + abstractString = abs.content; + return abstractString; - items = Zotero.selectItems(items); - - for (var i in items) { - URLS.push(i); + } + return abstractString; + +} + +function doTitle(doc, url){ + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + + var temp=""; + var getTitle = '//div[@id="left_col"]/h1'; + var getTitleObject = doc.evaluate(getTitle, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + if (getTitleObject) { + var temp=getTitleObject.textContent.replace(/^\s*|\s*$/g, ''); + return temp; + } + return temp; +} + +function doDate(doc, url){ + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + + var dateXpath = "//div[@id='toolbox']/div[3]"; + var dateXpathObject = doc.evaluate(dateXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + try { + if (dateXpathObject){ + var storeDateValue = dateXpathObject.textContent.replace(/\b(Last updated )\d{0,9}:\d{0,9} /g,''); + + var ArrayDate = storeDateValue.split('/'); + var emptyString = " "; + var comma = ", "; + var DateString; + var ArrayMonth = new Array("Jan", "Feb", "Mar", "Apr", "May", "Jun", "July", "Aug", "Sep", "Oct", "Nov", "Dec"); + var ArrayNumber = new Array("01", "02", "03", "04", "05", "06", "07", "08", "09", "10", "11", "12"); + for (var i=0; i <ArrayNumber.length; i++){ + if(ArrayDate[1] ==ArrayNumber[i]) { + + ArrayNumber[i] = ArrayMonth[i]; + var month = ArrayNumber[i] + emptyString; + } + DateString = month + ArrayDate[0] + comma + ArrayDate[2]; + + } + return DateString; + } else { + DateString = ""; + return DateString; } - } else { - URLS.push(url); + }catch (err) { + + DateString = ""; } + return DateString; +} + + +function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == 'x') return namespace; else return null; + } : null; + + //var articles = new Array(); - Zotero.Utilities.processDocuments(URLS, scrape, function() {Zotero.done();}); + if (detectWeb(doc, url) == "newspaperArticle") { + var articles = [url]; + + }else if (detectWeb(doc, url) == "blogPost") { + var articles = [url]; + + } + + + //Zotero.debug(articles); + Zotero.Utilities.processDocuments(articles, scrape, function() {Zotero.done();}); Zotero.wait(); -} -\ No newline at end of file + +} +