commit 433ab89a8dd935de0ed5046ef9253694dc1ca39c
parent 85235da6b3a1d6055fc221b5fa5d6d8aaf81d25e
Author: Avram Lyon <ajlyon@gmail.com>
Date: Wed, 30 Mar 2011 15:16:24 +0000
Trans: Rewrite of IEEE Xplore
Diffstat:
1 file changed, 271 insertions(+), 37 deletions(-)
diff --git a/translators/IEEE Xplore.js b/translators/IEEE Xplore.js
@@ -2,9 +2,9 @@
"translatorID":"92d4ed84-8d0-4d3c-941f-d4b9124cfbb",
"translatorType":4,
"label":"IEEE Xplore",
- "creator":"Simon Kornblith, Michael Berkowitz and Bastian Koenings)",
- "target":"https?://[^/]*ieeexplore.ieee.org[^/]*/(?:[^\\?]+\\?(?:|.*&)arnumber=[0-9]+|search/(?:searchresult.jsp|selected.jsp))",
- "minVersion":"1.0.1",
+ "creator":"Simon Kornblith, Michael Berkowitz, Bastian Koenings, and Avram Lyon",
+ "target":"^https?://[^/]*ieeexplore\\.ieee\\.org[^/]*/(?:[^\\?]+\\?(?:|.*&)arnumber=[0-9]+|search/(?:searchresult.jsp|selected.jsp))",
+ "minVersion":"2.1",
"maxVersion":"",
"priority":100,
"inRepository":true,
@@ -37,7 +37,7 @@ function doWeb(doc, url) {
// search page
var items = new Array();
- var xPathRows = '//form[@id="search_results_form"]/ul[@class="Results"]/li[@class="noAbstract"]/div[@class="header"]';
+ var xPathRows = '//ul[@class="Results"]/li[@class="noAbstract"]/div[@class="header"]';
var tableRows = doc.evaluate(xPathRows, doc, nsResolver, XPathResult.ANY_TYPE, null);
var tableRow;
while(tableRow = tableRows.iterateNext()) {
@@ -61,41 +61,275 @@ function doWeb(doc, url) {
for(var url in items) {
urls.push(url);
}
+ Zotero.Utilities.processDocuments(urls, scrape, function () { Zotero.done(); });
+ Zotero.wait();
} else {
- var urls = [url];
+ scrape(doc, url);
}
+}
+
+// Classify an identifier string; returns [kind, value] where kind is "doi", "isbn", "issn", "pmid",
+// "isbn13", "isbn10", an unrecognized "prefix:" label taken verbatim from the input, or "unknown".
+function parseIdentifier(identifier) {
+ var idPieces = identifier.split(':');
+ if (idPieces.length > 1) {
+ var prefix = idPieces.shift();
+ var value = idPieces.join(':'); // re-join: the value itself may contain colons
+ switch (prefix.toLowerCase()) {
+ case "doi": case "isbn": case "issn": case "pmid": return [prefix.toLowerCase(), value];
+ default: // unrecognized prefix: log it, then hand it back as written
+ }
+ Zotero.debug("Unknown identifier prefix '"+prefix+"'");
+ return [prefix, value];
+ }
+ // An unlabelled DOI is recognized by its leading "10.".
+ if (identifier.substr(0,3) == '10.') return ["doi", identifier];
+ // If we're here, we have a funny number; idCheck() gives each validated form, or false.
+ var ids = idCheck(identifier);
+ if (ids.isbn13) return ["isbn13", ids.isbn13];
+ if (ids.isbn10) return ["isbn10", ids.isbn10];
+ if (ids.issn) return ["issn", ids.issn];
- for each(var url in urls) {
- var m = articleRe.exec(url);
- var post = "recordIds="+m[2]+"&fromPageName=searchabstract&citations-format=citation-abstract&download-format=download-ris&x=62&y=13";
- Zotero.Utilities.HTTP.doPost("http://ieeexplore.ieee.org/xpl/downloadCitations", post, function(text) {
- //handle DOI
- var doiregex = /DOI\s+-\s(.+)/;
- var doi = doiregex.exec(text);
-
- //replace journal abbreviation
- var jaregex = /JA\s+-\s(.+)/;
- var ja = jaregex.exec(text);
-
- // load translator for RIS
- var translator = Zotero.loadTranslator("import");
- translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
- translator.setString(text);
- translator.setHandler("itemDone", function(obj, item) {
- // abstracts are notes in Xplore's RIS
- if(item.notes[0] && item.notes[0].note) {
- item.abstractNote = item.notes[0].note;
- item.notes = new Array();
- }
- if(doi) { item.DOI = doi[1]; }
- if(ja) { item.journalAbbreviation = ja[1]; }
-
- pdfurl = "http://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&arnumber="+m[2];
- item.attachments.push({url:pdfurl, title:"IEEE Xplore PDF", mimeType:"application/pdf"});
- item.complete();
- });
- translator.translate();
- });
+ return ["unknown", identifier];
+}
+
+// Parse `identifier` with parseIdentifier() and store its value on the matching field of `item`.
+function addIdentifier(identifier, item) {
+ // Kinds without an entry here (e.g. "pmid", "unknown", unrecognized prefixes) are dropped.
+ var fieldByKind = {"doi": "DOI", "isbn": "ISBN", "isbn13": "ISBN", "isbn10": "ISBN", "issn": "ISSN"};
+ var pair = parseIdentifier(identifier);
+ var kind = pair[0];
+ var field = fieldByKind.hasOwnProperty(kind) ? fieldByKind[kind] : null;
+ if (field !== null) {
+ item[field] = pair[1];
+ }
+}
+
+// Build a journalArticle item from the page's <meta> tags (PRISM, Google "citation_*" and
+// Dublin Core), take the abstract from the page body if no tag supplied one, and complete the item.
+function scrape(doc,url)
+{
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == 'x') return namespace; else return null;
+ } : null;
+
+ var newItem=new Zotero.Item("journalArticle");
+ newItem.url = doc.location.href;
+ var metaTags = doc.getElementsByTagName("meta");
+
+ var pages = [false, false]; // [first page, last page]
+ var pdf = false;
+ var html = false;
+ var dcCreators = []; // dc.creator names, held back as a fallback for citation_authors
+ for (var i = 0; i< metaTags.length; i++) {
+ var tag = metaTags[i].getAttribute("name");
+ var value = metaTags[i].getAttribute("content");
+ //Zotero.debug(pages + pdf + html);
+ //Zotero.debug("Have meta tag: " + tag + " => " + value);
+ switch (tag) {
+ // PRISM
+ case "prism.publicationName": newItem.publicationTitle = value; break;
+ case "prism.issn": if (!newItem.ISSN && value != "NaN" && value != "") newItem.ISSN = value; break;
+ case "prism.eIssn": if (!newItem.ISSN && value != "NaN" && value != "") newItem.ISSN = value; break;
+ // This is often NaN for some reason
+ case "prism.publicationDate": if (!newItem.date && value != "NaN" && value !== "") newItem.date = value; break;
+ case "prism.volume": if (!newItem.volume && value != "NaN" && value != "") newItem.volume = value; break;
+ case "prism.number": if (!newItem.issue && value != "NaN" && value != "") newItem.issue = value; break;
+ // These also seem bad
+ case "prism.startingPage": if(!pages[0] && value != "null" && value != "") pages[0] = value; break;
+ case "prism.endingPage": if(!pages[1] && value != "null" && value != "") pages[1] = value; break;
+ // Google.
+ case "citation_journal_title": if (!newItem.publicationTitle) newItem.publicationTitle = value; break;
+ case "citation_authors":
+ // I'm a little concerned we'll see multiple copies of the author names...
+ for each(var author in value.split(';'))
+ newItem.creators.push(Zotero.Utilities.cleanAuthor(author, "author", true));
+ break;
+ case "citation_title": if (!newItem.title) newItem.title = value; break;
+ case "citation_publisher": if (!newItem.publisher) newItem.publisher = value; break;
+ case "citation_date": if (!newItem.date && value != "NaN" && value != "") newItem.date = value; break;
+ case "citation_year": if (!newItem.date && value != "NaN" && value != "") newItem.date = value; break;
+ case "citation_volume": if (!newItem.volume && value != "NaN" && value != "") newItem.volume = value; break;
+ case "citation_issue": if (!newItem.issue && value != "NaN" && value != "") newItem.issue = value; break;
+ case "citation_firstpage": if (!pages[0] && value != "NaN" && value != "") pages[0] = value; break;
+ case "citation_lastpage": if (!pages[1] && value != "NaN" && value != "") pages[1] = value; break;
+ case "citation_issn": if (!newItem.ISSN && value != "NaN" && value != "") newItem.ISSN = value; break;
+ case "citation_isbn": if (!newItem.ISBN && value != "NaN" && value != "") newItem.ISBN = value; break;
+ // Prefer long language names
+ case "citation_language": if ((!newItem.language || newItem.language.length < 4)
+ && value != "null" && value != "") newItem.language = value; break;
+ case "citation_doi": if (!newItem.DOI) newItem.DOI = value; break;
+ case "citation_abstract": newItem.abstractNote = value; break;
+ case "citation_abstract_html_url": newItem.url = value; break;
+ case "citation_pdf_url": if(!pdf) pdf = value; break;
+ case "citation_keywords": newItem.tags.push(value); break;
+ case "citation_fulltext_html_url": if(!html) html = value; break;
+ case "fulltext_pdf": if(!pdf) pdf = value; break;
+ // Dublin Core
+ case "dc.publisher": if(!newItem.publisher) newItem.publisher = value; break;
+ case "dc.language": if(!newItem.language) newItem.language = value; break;
+ case "dc.rights": if(!newItem.rights) newItem.rights = value; break;
+ case "dc.title": if(!newItem.title) newItem.title = value; break;
+ case "dc.creator": dcCreators.push(Zotero.Utilities.cleanAuthor(value, "author")); break;
+ // This is often NaN for some reason
+ case "dc.date": if (!newItem.date && value != "NaN" && value !== "") newItem.date = value; break;
+ case "dc.identifier": addIdentifier(value, newItem); break;
+ default:
+ Zotero.debug("Ignoring meta tag: " + tag + " => " + value);
+ }
+ }
+
+ // Dublin Core creators are used only when nothing else named any, so authors are not listed twice.
+ if (newItem.creators.length == 0) newItem.creators = dcCreators;
+
+ // When both full-text forms are advertised, keep the PDF and append the HTML page after it.
+ if (pdf) newItem.attachments = [{url:pdf, title:"IEEE Xplore Full Text PDF", mimeType:"application/pdf"}];
+ if (html) newItem.attachments = (pdf ? newItem.attachments : []).concat([{url:html, title:"IEEE Xplore Full Text HTML"}]);
+
+ if (pages[0] && pages[1]) newItem.pages = pages.join('-');
+ else newItem.pages = pages[0] ? pages[0] : (pages[1] ? pages[1] : "");
+
+ // Abstracts don't always come with the meta tags; fall back to the first paragraph after the "Abstract" anchor
+ if (!newItem.abstractNote) {
+ var abstractNode = doc.evaluate('//a[@name="Abstract"]/following-sibling::p[1]', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (abstractNode) newItem.abstractNote = abstractNode.textContent;
}
- Zotero.wait();
+ newItem.complete();
+}
+
+// Implementation of ISBN and ISSN check-digit verification
+// Based on ISBN Users' Manual (http://www.isbn.org/standards/home/isbn/international/html/usm4.htm)
+// and the Wikipedia treatment of ISBN (http://en.wikipedia.org/wiki/International_Standard_Book_Number)
+// and the Wikipedia treatment of ISSN (http://en.wikipedia.org/wiki/International_Standard_Serial_Number)
+
// This will also check ISMN validity, although it does not distinguish ISMNs from their
// neighbors in namespace, ISBN-13s. It does not handle pre-2008 M-prefixed ISMNs; see
// http://en.wikipedia.org/wiki/International_Standard_Music_Number
+
+// This does not validate multiple identifiers in one field,
+// but it will gracefully ignore all non-number detritus,
+// such as extraneous hyphens, spaces, and comments.
+
+// It currently maintains hyphens in non-initial and non-final position,
+// discarding consecutive ones beyond the first as well.
+
+// It also adds the customary hyphen to valid ISSNs.
+
+// Takes the first 8 valid digits and tries to read an ISSN,
+// takes the first 10 valid digits and tries to read an ISBN 10,
+// and takes the first 13 valid digits to try to read an ISBN 13
// Returns an object with three attributes:
+// "issn"
+// "isbn10"
+// "isbn13"
+// Each will be set to a valid identifier if found, and otherwise be a
+// boolean false.
+
// There could conceivably be a valid ISBN-13 with an ISBN-10 substring; this should
// probably be interpreted as the latter, but both are returned and the caller chooses.
+function idCheck(isbn) {
+ // For ISBN 10, multiply by these coefficients, take the sum mod 11
+ // and subtract from 11
+ var isbn10 = [10, 9, 8, 7, 6, 5, 4, 3, 2];
+
+ // For ISBN 13, multiply by these coefficients, take the sum mod 10
+ // and subtract from 10
+ var isbn13 = [1, 3, 1, 3, 1, 3, 1, 3, 1, 3, 1, 3];
+
+ // For ISSN, multiply by these coefficients, take the sum mod 11
+ // and subtract from 11
+ var issn = [8, 7, 6, 5, 4, 3, 2];
+
+ // We make a single pass through the provided string, interpreting the
+ // first 8 valid characters as an ISSN, the first 10 as an ISBN-10, and
+ // the first 13 as an ISBN-13. We then return an object holding, for each
+ // kind, the detected identifier or false.
+
+ var j = 0; // number of digits consumed so far
+ var sum8 = 0;
+ var num8 = "";
+ var sum10 = 0;
+ var num10 = "";
+ var sum13 = 0;
+ var num13 = "";
+ var check8, check10, check13; // check digits as read; left undefined when the input is too short
+
+ for (var i=0; i < isbn.length; i++) {
+ var c = isbn.charAt(i);
+ // Only ASCII digits count as digits. The loose test (c - 0) == c is
+ // also true for blanks (" ", tab, newline), which would be read as 0.
+ var isDigit = (c >= "0" && c <= "9");
+ var isX = (c == "X" || c == "x");
+ if (j > 0 && c == "-" && isbn.charAt(i-1) != "-") {
+ // Preserve hyphens, except in initial and final position
+ // Also discard consecutive hyphens
+ if(j < 7) num8 += "-";
+ if(j < 10) num10 += "-";
+ if(j < 13) num13 += "-";
+ } else if (j < 7 && isDigit) {
+ sum8 += c * issn[j];
+ sum10 += c * isbn10[j];
+ sum13 += c * isbn13[j];
+ num8 += c;
+ num10 += c;
+ num13 += c;
+ j++;
+ } else if (j == 7 && (isX || isDigit)) {
+ // In ISSN, an X represents the check digit "10".
+ if (isX) {
+ check8 = 10;
+ num8 += "X";
+ } else {
+ check8 = c;
+ sum10 += c * isbn10[j];
+ sum13 += c * isbn13[j];
+ num8 += c;
+ num10 += c;
+ num13 += c;
+ j++;
+ }
+ } else if (j < 9 && isDigit) {
+ sum10 += c * isbn10[j];
+ sum13 += c * isbn13[j];
+ num10 += c;
+ num13 += c;
+ j++;
+ } else if (j == 9 && (isX || isDigit)) {
+ // In ISBN-10, an X represents the check digit "10".
+ if (isX) {
+ check10 = 10;
+ num10 += "X";
+ } else {
+ check10 = c;
+ sum13 += c * isbn13[j];
+ num10 += c;
+ num13 += c;
+ j++;
+ }
+ } else if (j < 12 && isDigit) {
+ sum13 += c * isbn13[j];
+ num13 += c;
+ j++;
+ } else if (j == 12 && isDigit) {
+ check13 = c;
+ num13 += c;
+ }
+ }
+ // The outer modulus maps a computed value of 11 (ISSN, ISBN-10) or 10 (ISBN-13)
+ // to the check digit 0.
+ var valid8 = ((11 - sum8 % 11) % 11) == check8;
+ var valid10 = ((11 - sum10 % 11) % 11) == check10;
+ var valid13 = ((10 - sum13 % 10) % 10) == check13;
+ var matches = false;
+
+ // Since ISSNs have a standard hyphen placement, we can add a hyphen
+ if (valid8 && (matches = num8.match(/([0-9]{4})([0-9]{3}[0-9Xx])/))) {
+ num8 = matches[1] + '-' + matches[2];
+ }
+
+ if(!valid8) {num8 = false;}
+ if(!valid10) {num10 = false;}
+ if(!valid13) {num13 = false;}
+ return {"isbn10" : num10, "isbn13" : num13, "issn" : num8};
}