commit 1f0d24ceef961014c673a9fe646968578f4384b2
parent 9ca461c59b4dd73f35db16161d60cba49dc6e2c8
Author: Simon Kornblith <simon@simonster.com>
Date: Tue, 24 Mar 2009 02:08:08 +0000
- use DOIs for PDF metadata lookup when available (needs testing)
- fix accessibility of createContextObject in Zotero.Utilities
- improved CrossRef translator
Diffstat:
3 files changed, 152 insertions(+), 78 deletions(-)
diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js
@@ -26,6 +26,7 @@
const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif";
+const DOIre = /\bdoi\: *([^\s]+)/i;
/**
* Front end for recognizing PDFs
@@ -309,6 +310,16 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, ca
}
}
+ inputStream.close();
+ cacheFile.remove(false);
+
+ // look for DOI
+ var allText = lines.join("\n");
+ var m = DOIre.exec(allText);
+ if(m) {
+ this._DOI = m[1];
+ }
+
// get (not quite) median length
var lineLengthsLength = lineLengths.length;
if(lineLengthsLength < 20) {
@@ -328,9 +339,6 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, ca
this._startLine = this._iteration = 0;
}
- inputStream.close();
- cacheFile.remove(false);
-
if(lineLengthsLength >= 20) {
this._queryGoogle();
}
@@ -349,53 +357,67 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
return;
}
this._iteration++;
-
- // take the relevant parts of some lines (exclude hyphenated word)
- var queryStringWords = 0;
+
var queryString = "";
- while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
- var words = this._goodLines[this._startLine].split(/\s+/);
- // get rid of first and last words
- words.shift();
- words.pop();
- // make sure there are no long words (probably OCR mistakes)
- var skipLine = false;
- for(var i=0; i<words.length; i++) {
- if(words[i].length > 20) {
- skipLine = true;
- break;
+ var me = this;
+ if(this._DOI) {
+ // use CrossRef to look for DOI
+ translate = new Zotero.Translate("search", true, false);
+ translate.setTranslator("11645bd1-0420-45c1-badb-53fb41eeb753");
+ var item = {"itemType":"journalArticle", "DOI":this._DOI};
+ translate.setSearch(item);
+ translate.setHandler("itemDone", function(translate, item) { me._callback(item); });
+ translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
+ translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
+ translate.translate();
+ delete this._DOI;
+ } else {
+ // take the relevant parts of some lines (exclude hyphenated word)
+ var queryStringWords = 0;
+ while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
+ var words = this._goodLines[this._startLine].split(/\s+/);
+ // get rid of first and last words
+ words.shift();
+ words.pop();
+ // make sure there are no long words (probably OCR mistakes)
+ var skipLine = false;
+ for(var i=0; i<words.length; i++) {
+ if(words[i].length > 20) {
+ skipLine = true;
+ break;
+ }
}
+ // add words to query
+ if(!skipLine && words.length) {
+ queryStringWords += words.length;
+ queryString += '"'+words.join(" ")+'" ';
+ }
+ this._startLine++;
}
- // add words to query
- if(!skipLine && words.length) {
- queryStringWords += words.length;
- queryString += '"'+words.join(" ")+'" ';
+
+ Zotero.debug("RecognizePDF: Query string "+queryString);
+
+ // pass query string to Google Scholar and translate
+ var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
+ if(!this._hiddenBrowser) {
+ this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
+ this._hiddenBrowser.docShell.allowImages = false;
}
- this._startLine++;
- }
- Zotero.debug("RecognizePDF: Query string "+queryString);
-
- // pass query string to Google Scholar and translate
- var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
- if(!this._hiddenBrowser) {
- this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
- this._hiddenBrowser.docShell.allowImages = false;
+
+ var translate = new Zotero.Translate("web", true, false);
+ translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
+ translate.setHandler("itemDone", function(translate, item) {
+ Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
+ me._callback(item);
+ });
+ translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
+ translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
+
+ this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
+
+ this._hiddenBrowser.loadURIWithFlags(url,
+ Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
}
-
- var me = this;
- var translate = new Zotero.Translate("web", true, false);
- translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
- translate.setHandler("itemDone", function(translate, item) {
- Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
- me._callback(item);
- });
- translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
- translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
-
- this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
-
- this._hiddenBrowser.loadURIWithFlags(url,
- Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, null, null, null);
}
/**
diff --git a/chrome/content/zotero/xpcom/utilities.js b/chrome/content/zotero/xpcom/utilities.js
@@ -581,7 +581,7 @@ Zotero.Utilities.prototype.processAsync = function (sets, callbacks, onDone) {
* @borrows Zotero.Date.formatDate as this.formatDate
* @borrows Zotero.Date.strToDate as this.strToDate
* @borrows Zotero.Date.strToISO as this.strToISO
- * @borrows Zotero.OpenURL.lookupContextObject as this.lookupContextObject
+ * @borrows Zotero.OpenURL.createContextObject as this.createContextObject
* @borrows Zotero.OpenURL.parseContextObject as this.parseContextObject
* @borrows Zotero.Utilities.HTTP.processDocuments as this.processDocuments
* @borrows Zotero.Utilities.HTTP.doPost as this.doPost
@@ -596,7 +596,7 @@ Zotero.Utilities.Translate.prototype.inArray = Zotero.inArray;
Zotero.Utilities.Translate.prototype.formatDate = Zotero.Date.formatDate;
Zotero.Utilities.Translate.prototype.strToDate = Zotero.Date.strToDate;
Zotero.Utilities.Translate.prototype.strToISO = Zotero.Date.strToISO;
-Zotero.Utilities.Translate.prototype.lookupContextObject = Zotero.OpenURL.lookupContextObject;
+Zotero.Utilities.Translate.prototype.createContextObject = Zotero.OpenURL.createContextObject;
Zotero.Utilities.Translate.prototype.parseContextObject = Zotero.OpenURL.parseContextObject;
/**
diff --git a/translators/CrossRef.js b/translators/CrossRef.js
@@ -18,11 +18,17 @@ function detectSearch(item) {
return false;
}
+function fixAuthorCapitalization(string) {
+ if(string.toUpperCase() == string) {
+ string = string.toLowerCase().replace(/\b[a-z]/g, function(m) { return m[0].toUpperCase() });
+ }
+ return string;
+}
+
function processCrossRef(xmlOutput) {
xmlOutput = xmlOutput.replace(/<\?xml[^>]*\?>/, "");
// parse XML with E4X
- var qr = new Namespace("http://www.crossref.org/qrschema/2.0");
try {
var xml = new XML(xmlOutput);
} catch(e) {
@@ -30,41 +36,87 @@ function processCrossRef(xmlOutput) {
}
// ensure status is valid
- var status = xml.qr::query_result.qr::body.qr::query.@status.toString();
- if(status != "resolved" && status != "multiresolved") {
- return false;
+ if(!xml.doi_record.length()) return false;
+ if(xml.doi_record[0].crossref.journal.length()) {
+ var item = new Zotero.Item("journalArticle");
+ var itemXML = xml.doi_record.crossref.journal;
+ var refXML = itemXML.journal_article;
+ var metadataXML = itemXML.journal_metadata;
+
+ item.ISSN = itemXML.journal_metadata.issn.toString();
+ item.publicationTitle = itemXML.journal_metadata.full_title.toString();
+ item.journalAbbreviation = itemXML.journal_metadata.abbrev_title.toString();
+ item.volume = itemXML.journal_issue.journal_volume.volume.toString();
+ item.issue = itemXML.journal_issue.issue.toString();
+ } else if(xml.doi_record[0].crossref.book.length()) {
+ var item = new Zotero.Item("book");
+ var refXML = xml.doi_record[0].crossref.book.book_metadata;
+ var metadataXML = refXML;
+ var seriesXML = metadataXML.series_metadata;
+
+ item.place = metadataXML.publisher.publisher_place.toString();
+ } else if(xml.doi_record[0].crossref.conference.length()) {
+ var item = new Zotero.Item("conferencePaper");
+ var itemXML = xml.doi_record[0].crossref.conference;
+ var refXML = itemXML.conference_paper;
+ var metadataXML = itemXML.proceedingsMetadata;
+ var seriesXML = metadataXML.series_metadata;
+
+ item.publicationTitle = itemXML.proceedings_metadata.proceedings_title.toString();
+ item.place = itemXML.event_metadata.conference_location.toString();
+ item.conferenceName = itemXML.event_metadata.conference_name.toString();
+ }
+
+ var contributors = refXML.contributors.children();
+
+ if(metadataXML.isbn.length()) item.ISBN = metadataXML.isbn[0].toString();
+ if(metadataXML.issn.length()) item.ISSN = metadataXML.issn[0].toString();
+ item.publisher = metadataXML.publisher.publisher_name.toString();
+ item.edition = metadataXML.edition_number.toString();
+ if(!item.volume) item.volume = metadataXML.volume.toString();
+
+ if(seriesXML && seriesXML.length()) {
+ if(seriesXML.contributors.length()) {
+ contributors += seriesXML.contributors.children();
+ }
+ item.seriesNumber = seriesXML.series_number.toString();
}
- var query = xml.qr::query_result.qr::body.qr::query;
- var item = new Zotero.Item("journalArticle");
+ for each(var creatorXML in contributors) {
+ var creator = {creatorType:"author"};
+ if(creatorXML.contributor_role == "editor") {
+ creator.creatorType = "editor";
+ } else if(creatorXML.contributor_role == "translator") {
+ creator.creatorType = "translator";
+ } else if(creatorXML.contributor_role == "chair") {
+ creator.creatorType = "contributor";
+ }
+
+ if(creatorXML.localName() == "organization") {
+ creator.fieldMode = 1;
+ creator.lastName = creatorXML.toString();
+ } else if(creatorXML.localName() == "person_name") {
+ creator.firstName = fixAuthorCapitalization(creatorXML.given_name.toString());
+ creator.lastName = fixAuthorCapitalization(creatorXML.surname.toString());
+ }
+ item.creators.push(creator);
+ }
- // try to get a DOI
- item.DOI = query.qr::doi.(@type=="journal_article").text().toString();
- if(!item.DOI) {
- item.DOI = query.qr::doi.(@type=="book_title").text().toString();
+ item.date = refXML.publication_date.year.toString();
+ if(refXML.publication_date.month.length()) {
+ item.date = refXML.publication_date.month.toString()+"/"+item.date;
}
- if(!item.DOI) {
- item.DOI = query.qr::doi.(@type=="book_content").text().toString();
+
+ if(refXML.pages.length()) {
+ item.pages = refXML.pages.first_page.toString();
+ if(refXML.pages.last_page.length()) {
+ item.pages += "-"+refXML.pages.last_page.toString();
+ }
}
- // try to get an ISSN (no print/electronic preferences)
- item.ISSN = query.qr::issn[0].text().toString();
- // get title
- item.title = query.qr::article_title.text().toString();
- // get publicationTitle
- item.publicationTitle = query.qr::journal_title.text().toString();
- // get author
- item.creators.push(Zotero.Utilities.cleanAuthor(query.qr::author.text().toString(), "author", true));
- // get volume
- item.volume = query.qr::volume.text().toString();
- // get issue
- item.issue = query.qr::issue.text().toString();
- // get year
- item.date = query.qr::year.text().toString();
- // get edition
- item.edition = query.qr::edition_number.text().toString();
- // get first page
- item.pages = query.qr::first_page.text().toString();
+ item.DOI = refXML.doi_data.doi.toString();
+ item.url = refXML.doi_data.resource.toString();
+ item.title = refXML.titles.title.toString();
item.complete();
return true;
@@ -80,7 +132,7 @@ function doSearch(item) {
var co = Zotero.Utilities.createContextObject(item);
}
- Zotero.Utilities.HTTP.doGet("http://www.crossref.org/openurl?req_dat=zter:zter321&"+co+"&noredirect=true", function(responseText) {
+ Zotero.Utilities.HTTP.doGet("http://www.crossref.org/openurl?req_dat=zter:zter321&"+co+"&noredirect=true&format=unixref", function(responseText) {
processCrossRef(responseText);
Zotero.done();
});