commit 3dcd3c8ca72246a675809635da82d6873cb374e2
parent d6c160fb5b69d3063dc607ab2be575272a4c4849
Author: Avram Lyon <ajlyon@gmail.com>
Date: Sun, 30 Jan 2011 14:18:59 +0000
Trans: Adding new TalisPrism translator by Emma Reisz and Will Smith.
Diffstat:
1 file changed, 435 insertions(+), 0 deletions(-)
diff --git a/translators/TalisPrism.js b/translators/TalisPrism.js
@@ -0,0 +1,434 @@
+{
+ "translatorID":"53f8d182-4edc-4eab-b5a1-141698a20202",
+ "label":"TalisPrism",
+ "creator":"William Smith and Emma Reisz",
+ "target":"/TalisPrism/(browseResults|doSearch)",
+ "minVersion":"1.0.0b4.r5",
+ "maxVersion":"",
+ "priority":100,
+ "inRepository":"1",
+ "translatorType":4,
+ "lastUpdated":"2010-11-15 11:35:54"
+}
+
+/* TalisPrism translator.
+ Version 1.1
+ By William Smith (http://www.willsmith.org/contactme)
+ and Emma Reisz
+
+TalisPrism is a library management system used by a number of universities
+and public bodies in the UK, Ireland and elsewhere.
+For example: http://qu-prism.qub.ac.uk/TalisPrism/
+and http://star.shef.ac.uk/TalisPrism/
+
+This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+*/
+
+
+// TalisPrism doesn't use metadata so everything must be scraped.
+
+function detectWeb(doc, url){
+
+ /* Can't differentiate multiple from single results by URL
+ as single search results have a search URL but display as browse.
+ Can't scrape the titles to differentiate between single and multiple as the display format
+ is too different to be scraped consistently.
+ Instead we differentiate by URL but make an exception for a solo result.
+ */
+ var search=searchTest(doc, url);
+
+ if (search==1) {
+ var doctype = 'multiple';
+ } else {doctype=docType(doc, url);
+ }
+ return doctype;
+}
+
+function docType (doc,url){
+ //Need xpaths to detect type.
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == "x" ) return namespace; else return null;
+ } : null;
+
+ // Best way to identify item type on an entry page is by its icon.
+ if (getXPath(doc, '//img[@alt="sound - disc"]/@alt').length) {
+ doctype = 'audioRecording';
+ } else if (getXPath(doc, '//img[@alt="Book"]/@alt').length) {
+ doctype = 'book';
+ } else if (getXPath(doc, '//img[@alt="video - disc"]/@alt').length) {
+ doctype = 'videoRecording';
+ } else {
+ doctype = 'document';
+ }
+ return doctype;
+}
+
+
+function searchTest (doc, url){
+
+ //Need xpaths to differentiate search and item pages.
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == "x" ) return namespace; else return null;
+ } : null;
+
+ var searchPage;
+ var search;
+ if (url.match(/doSearch/)) {
+ var resultCount;
+ var resultCountElements = new Array();
+ var resultCountText;
+ var resultCountPath = '//table/tbody/tr/td/table/tbody/tr/td[1]/font/span[@class="text"]/font';
+ var resultCountObject = doc.evaluate(resultCountPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ while (resultCountText = resultCountObject.iterateNext()) {
+ resultCountElements.push(resultCountText.textContent);
+ }
+ resultCount=resultCountElements[0];
+ if (resultCount == 1) {
+ search=0;
+ } else {
+ search=1;
+ }
+ } else {
+ var pageCount;
+ var pageCountElements = new Array();
+ var pageCountText;
+ var pageCountPath= '//tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr[2]/td/font/span[@class="text"]/table/tbody/tr/td[4]';
+ var pageCountObject = doc.evaluate(pageCountPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ while (pageCountText = pageCountObject.iterateNext()) {
+ pageCountElements.push(pageCountText.textContent);
+ }
+ pageCount=pageCountElements[0];
+ if (pageCount==undefined){
+ search=0;
+ } else if (pageCount.match(/Page/)){
+ search=1
+ } else {
+ search=0;
+ }
+ }
+ return search;
+}
+
+function getXPath ( doc, field ) {
+ xpath = field;
+
+ content = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
+
+ if (content)
+ return content.textContent;
+ else
+ return '';
+
+}
+
+//TalisPrism displays each value after a text label. getField finds the label and returns the first following field whose text differs from it.
+
+function getField (doc, field) {
+
+ xpath='//span[@class="text"]';
+
+ content = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null);
+
+ while (c = content.iterateNext())
+ {
+ if (c.textContent == field)
+ {
+ // OK, find the next field
+ while (val = content.iterateNext()) {
+
+ if (val && val.textContent != c.textContent)
+ {
+ return val.textContent;
+ }
+ }
+ }
+ }
+ return '';
+}
+
+function multiscrape(doc, url) {
+ url=doc.documentURI;
+ var item;
+ var doctype = docType(doc, url);
+ item = new Zotero.Item(doctype);
+ scrape(doc,url, item);
+}
+
+
+function soloscrape(doc, url) {
+ url=doc.documentURI;
+ var item;
+ item = new Zotero.Item(doctype);
+ scrape(doc,url, item);
+ return '';
+}
+
+
+function scrape(doc, url, item){
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == 'x') return namespace; else return null;
+ } : null;
+
+ // The fields often contain multiple data types and need some cleanup.
+ var title = getField(doc, 'Title');
+
+ if (title.length == 0) {
+ title = 'Unknown Title';
+ }
+ // If title includes a forward slash, omit the last bit.
+ if (title.match('/')) {
+ title = title.substring(0, title.lastIndexOf('/'));
+ }
+ title = title.replace(/^\s+|\s+$/g, '');
+ item.title = title;
+
+ var author = getField(doc, 'Author');
+ if (author.length) {
+ item.creators.push(Zotero.Utilities.cleanAuthor(author, "author", 1));
+ } else {
+ author = getField(doc, 'Other Author(s) / Title(s)');
+ if (author.length) {
+ item.creators.push(Zotero.Utilities.cleanAuthor(author, "author", 1));
+ }
+ }
+
+
+ // Place, publisher and publish date are in the same field. Format is usually "Place : Publisher, yyyy".
+
+ var publishing = getField(doc, 'Publisher');
+ if (publishing.length == 0) {
+ publishing = getField(doc, 'Published');
+ }
+ if (publishing.length == 0) {
+ publishing = getField(doc, 'Publication details');
+ }
+
+ if (publishing.match(/(13|14|15|16|17|18|19|20)\d\d/)) {
+ var pos = publishing.search(/(13|14|15|16|17|18|19|20)\d\d/);
+ item.date = publishing.substring(pos, publishing.lastIndexOf('.')).match(/\d\d\d\d/);
+ var place = publishing.substring(0, publishing.indexOf(':'));
+ item.place = place.replace(/^\s+|\s+$/g, '');
+ var publisher = publishing.substring(publishing.indexOf(':')+1, pos);
+ item.publisher = publisher.replace(/^\s+|\s+$|\,\s+$/g, '');
+ }
+
+
+ var isbn = getField(doc, 'ISBN');
+ if (isbn.length == 0) {
+ isbn = getField(doc, 'ISBN, etc.');
+ }
+
+ isbn=isbn.replace(/^\D+|\D+$/g, "");
+ item.ISBN = isbn.substring(0).match(/\d+/);
+
+ var series = getField(doc, 'Series');
+ var pos2 =series.lastIndexOf(';');
+ if (pos2==-1){
+ item.series=series.replace(/^\s+|\s+$/g, '');
+ }else{
+ var seriesName = series.substring(0, pos2);
+ item.series = seriesName.replace(/^\s+|\s+$/g, '');
+ var seriesNumber = series.substring(pos2+1);
+ item.seriesNumber = seriesNumber.replace(/^\s+|\s+$/g, '');
+ }
+
+ item.edition = getField(doc, 'Edition');
+
+ var physical = getField(doc, 'Physical details');
+ var numPages = physical.substring(0, physical.indexOf(':'));
+ item.numPages = numPages.replace(/^\s+|\s+$/g, '');
+
+ var physicaldetails = physical.substring(physical.indexOf(':')+1, physical.lastIndexOf('.'));
+ physicaldetails = physicaldetails.replace(/^\s+|\s+$/g, '');
+
+ var databasedetails = getField(doc, 'Cited/indexed in');
+ databasedetails = databasedetails.replace(/^\s+|\s+$/g, '');
+
+ item.extra = databasedetails + physicaldetails
+
+ item.attachments.push({url:url, title:"Snapshot of Library Page", mimeType:"text/html"});
+
+ var doctitle
+ doctitle = doc.title
+ if (doctitle == "TalisPrism"){
+ item.libraryCatalog =url.substring(url.indexOf('http'), url.indexOf('/TalisPrism'));
+ } else {
+ item.libraryCatalog = doctitle
+ }
+
+
+ /* We need to XPath to the call number as we cannot be sure about the previous cell,
+ so the label method won't work. Some items have multiple call numbers,
+ but a generalised XPath which retrieves multiple sets of location data (tr[2], tr[3] etc.)
+ also retrieves tr [1], which contains all the rest of the bibliographic entry.
+ The size of tr[1] varies and there is no consistent final item,
+ so instead of using a general XPath, we scrape tr[2], tr[3] and tr[4] successively into an array;
+ tr[5] is also scraped into the array, but if non-null, 'See record for additional call numbers.'
+ is returned as the final shelfmark. Note that each call number is itself scraped into an
+ array ('shelfmarkElements'), as we need both the Library and Shelfmark elements.
+ */
+
+ var shelfmark = new Array();
+ var callNumber = "";
+
+ //Need to test whether the search page has a sidebar showing as this shifts the classmarks.
+
+ var authorModePath='//td/table/tbody/tr/td[1]/font/span[@class="text"]/table/tbody/tr[2]/td/font/span[@class="text"]/font/b/span[@class="text"]/table/tbody/tr/td[2]';
+ var authorModeObject=doc.evaluate(authorModePath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ var browseModePath='//td/table/tbody/tr/td[1]/font/span[@class="text"]/table/tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr/td[1]';
+ var browseModeObject=doc.evaluate(browseModePath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ var shelfmarkPath = new Array();
+ shelfmarkPath[0] = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[2]/td';
+ shelfmarkPath[1] = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[3]/td';
+ shelfmarkPath[2] = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[4]/td';
+ shelfmarkPath[3] = '//td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr[5]/td';
+ var shelfmarkText;
+ if (authorModeObject==null||authorModeObject.innerHTML==null){
+ if (browseModeObject==null||browseModeObject.innerHTML==null){
+ for (var i=0; i < 4; i ++){
+ var shelfmarkObject = new Array();
+ var shelfmarkElements = new Array();
+ shelfmarkObject[i] = doc.evaluate(shelfmarkPath[i], doc, nsResolver, XPathResult.ANY_TYPE, null);
+ while (shelfmarkText = shelfmarkObject[i].iterateNext()) {
+ shelfmarkElements.push(shelfmarkText.textContent);
+ }
+ shelfmark[i]=shelfmarkElements[0]+" "+shelfmarkElements[1];
+ //Need to remove junk text scraped when there is a request button in the call number field.
+ shelfmark[i] = shelfmark[i].replace(/\s*\/*(?:xc_d.write.*\;)/, '');
+ }
+ } else if (browseModeObject.innerHTML.match(/arrow/)) {
+ for (var i=0; i < 4; i ++){
+ var shelfmarkObject = new Array();
+ var shelfmarkElements = new Array();
+ shelfmarkObject[i] = doc.evaluate(shelfmarkPath[i], doc, nsResolver, XPathResult.ANY_TYPE, null);
+ while (shelfmarkText = shelfmarkObject[i].iterateNext()) {
+ shelfmarkElements.push(shelfmarkText.textContent);
+ }
+ shelfmark[i]=shelfmarkElements[1]+" "+shelfmarkElements[2];
+ shelfmark[i] = shelfmark[i].replace(/\s*\/*(?:xc_d.write.*\;)/, '');
+ }
+ }
+ }else if (authorModeObject.innerHTML.match(/arrow/)){
+ for (var i=0; i < 4; i ++){
+ var shelfmarkObject = new Array();
+ var shelfmarkElements = new Array();
+ shelfmarkObject[i] = doc.evaluate(shelfmarkPath[i], doc, nsResolver, XPathResult.ANY_TYPE, null);
+ while (shelfmarkText = shelfmarkObject[i].iterateNext()) {
+ shelfmarkElements.push(shelfmarkText.textContent);
+ }
+ shelfmark[i]=shelfmarkElements[1]+" "+shelfmarkElements[2];
+ shelfmark[i] = shelfmark[i].replace(/\s*\/*(?:xc_d.write.*\;)/, '');
+ }
+ }
+ if (shelfmark[0] != "undefined undefined"){
+ callNumber = shelfmark[0];
+ }
+ for (var i=1; i<3; i++){
+ if (shelfmark[i] != "undefined undefined"){
+ callNumber = callNumber + "; " + shelfmark[i];
+ }
+ }
+ if (shelfmark[3] != "undefined undefined"){
+ callNumber = callNumber + ". See record for additional call numbers.";
+ }
+
+ item.callNumber = callNumber;
+
+ var link = getField (doc, 'Link to');
+ if (link.length == 0) {
+ var linkPath='//span[@class="text"]/table/tbody/tr/td/table/tbody/tr/td[2]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr/td[2]/font/span[@class="text"]/a';
+ var linkObject=doc.evaluate(linkPath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (linkObject==null){
+ } else {
+ var linkTitle=linkObject.textContent;
+ var linkLink=linkObject.href;
+ if (linkTitle=="Link to electronic text"){
+ link=linkLink;
+ }
+ }
+ }
+ item.url = link;
+
+ item.complete();
+ return '';
+}
+
+function doWeb(doc, url) {
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == "x" ) return namespace; else return null;
+ } : null;
+
+ var articles = new Array ();
+ var names = new Array ();
+ var items = new Object ();
+ var nextTitle;
+ doctype=detectWeb(doc, url);
+
+ /* Typically scrapers process both search pages and item pages in the same way;
+ the processDocuments function is used, calling the scraped result link URLs for a search page,
+ and for an item page calling the item page's own URL.
+ But Talis displays solo search results with an unstable URL and with no link to an item page.
+ So we cannot call the URL for a solo search result as it will yield a null page.
+ Instead we must process solo search results directly without using processDocuments.
+ We want to process item pages in the same way as solo search pages because
+ waiting for the URL on an item page to be called noticeably slows down the scrape.
+ */
+
+ var indexPath ='//span[@class="text"]/x:table/x:tbody/x:tr/x:td/x:table/x:tbody/x:tr/x:td[1]'
+ var index;
+ var indexElements = new Array();
+ var indexText;
+ var indexObject = doc.evaluate(indexPath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ while (indexText = indexObject.iterateNext()) {
+ indexElements.push(indexText.textContent);
+ }
+ index=indexElements[0];
+ index1=indexElements[1];
+ if (doctype == "multiple" && index.match(/Index/) && index1 == ""){
+ var titlePath = '//td[3]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr/td[1]/font/span[@class="text"]/a';
+ var titles = doc.evaluate(titlePath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ while (nextTitle = titles.iterateNext()) {
+ items[nextTitle.href] = nextTitle.textContent;
+ names.push(nextTitle.textContent);
+ }
+ items = Zotero.selectItems(items);
+ for (var i in items) {
+ articles.push(i);
+ }
+ Zotero.Utilities.processDocuments(articles, multiscrape, function(){Zotero.done();});
+
+ } else if (doctype == "multiple") {
+ var titlePath = '//td[4]/font/span[@class="text"]/table/tbody/tr/td/font/span[@class="text"]/table/tbody/tr/td[1]/font/span[@class="text"]/a';
+ var titles = doc.evaluate(titlePath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ while (nextTitle = titles.iterateNext()) {
+ items[nextTitle.href] = nextTitle.textContent;
+ names.push(nextTitle.textContent);
+ }
+ items = Zotero.selectItems(items);
+ for (var i in items) {
+ articles.push(i);
+ }
+ Zotero.Utilities.processDocuments(articles, multiscrape, function(){Zotero.done();});
+ }
+ else {
+ soloscrape(doc, url);
+ }
+ Zotero.wait();
+
+}
+\ No newline at end of file