commit 71a37511797b232a58de4a699a9d7df8a8286cbd
parent 121b75ef6c3fbddf72bc20cdd88a20fbea97ecc5
Author: Simon Kornblith <simon@simonster.com>
Date: Fri, 31 Jan 2014 00:16:09 -0800
Merge pull request #433 from aurimasv/retrieve-meta
Retrieve Metadata query limit fixes
Diffstat:
7 files changed, 671 insertions(+), 177 deletions(-)
diff --git a/chrome/content/zotero/captcha.js b/chrome/content/zotero/captcha.js
@@ -0,0 +1,73 @@
+/*
+ ***** BEGIN LICENSE BLOCK *****
+
+ Copyright © 2009 Center for History and New Media
+ George Mason University, Fairfax, Virginia, USA
+ http://zotero.org
+
+ This file is part of Zotero.
+
+ Zotero is free software: you can redistribute it and/or modify
+ it under the terms of the GNU Affero General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ Zotero is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU Affero General Public License for more details.
+
+ You should have received a copy of the GNU Affero General Public License
+ along with Zotero. If not, see <http://www.gnu.org/licenses/>.
+
+ ***** END LICENSE BLOCK *****
+*/
+
+var Zotero_Captcha = new function() {
+	// I/O object handed over by the opener (window.arguments[0]); set in onLoad
+	this._io;
+
+	/**
+	 * Put `text` into `element` and reveal it; hide the element when there
+	 * is no text to show.
+	 */
+	function showOptionalText(element, text) {
+		if(!text) {
+			element.hidden = true;
+			return;
+		}
+		element.textContent = text;
+		element.hidden = false;
+	}
+
+	/**
+	 * Window load handler. Reads the dialog input (title, description, error,
+	 * imgUrl) from window.arguments[0].dataIn, fills in the dialog and moves
+	 * focus to the CAPTCHA text field.
+	 */
+	this.onLoad = function() {
+		this._io = window.arguments[0];
+		var dataIn = this._io.dataIn;
+		var descriptionEl = document.getElementById('zotero-captcha-description');
+		var errorEl = document.getElementById('zotero-captcha-error');
+
+		if(dataIn.title) document.title = dataIn.title;
+
+		showOptionalText(descriptionEl, dataIn.description);
+		showOptionalText(errorEl, dataIn.error);
+
+		document.getElementById('zotero-captcha-image').src = dataIn.imgUrl;
+		document.getElementById('zotero-captcha-input').focus();
+	};
+
+	/**
+	 * Called once the CAPTCHA image has loaded; resizes the window to fit it.
+	 */
+	this.imageOnLoad = function() {
+		window.sizeToContent();
+	};
+
+	/**
+	 * Hand the typed answer back to the opener as dataOut.captcha and close
+	 * the window. Does nothing while the text field is empty.
+	 */
+	this.resolve = function() {
+		var answer = document.getElementById('zotero-captcha-input').value;
+		if(!answer) return;
+
+		this._io.dataOut = { captcha: answer };
+		window.close();
+	};
+
+	/**
+	 * Close the window without setting dataOut.
+	 */
+	this.cancel = function() {
+		window.close();
+	};
+}
+\ No newline at end of file
diff --git a/chrome/content/zotero/captcha.xul b/chrome/content/zotero/captcha.xul
@@ -0,0 +1,27 @@
+<?xml version="1.0"?>
+
+<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
+<?xml-stylesheet href="chrome://zotero/skin/zotero.css" type="text/css"?>
+
+<!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd">
+
+<window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
+ onload="Zotero_Captcha.onLoad();"
+ id="zotero-captcha"
+ onkeypress="if(event.keyCode === KeyEvent.DOM_VK_ESCAPE) Zotero_Captcha.cancel();">
+
+ <script src="include.js"/>
+ <script src="captcha.js"/>
+
+ <vbox style="padding:10px" align="center" flex="1">
+ <description id="zotero-captcha-description"></description>
+ <image id="zotero-captcha-image" onload="Zotero_Captcha.imageOnLoad();" />
+ <description id="zotero-captcha-error"></description>
+ <textbox id="zotero-captcha-input"
+ onkeypress="if(event.keyCode === KeyEvent.DOM_VK_RETURN) Zotero_Captcha.resolve();" />
+ <hbox>
+ <button label="&zotero.general.ok;" default="true" oncommand="Zotero_Captcha.resolve();" />
+ <button label="&zotero.general.cancel;" oncommand="Zotero_Captcha.cancel();" />
+ </hbox>
+ </vbox>
+</window>
diff --git a/chrome/content/zotero/pdfProgress.xul b/chrome/content/zotero/pdfProgress.xul
@@ -14,7 +14,9 @@
<tree flex="1" id="tree" hidecolumnpicker="true">
<treecols>
<treecol id="success-col" style="width:20px;"/>
+ <splitter class="tree-splitter" hidden="true"/>
<treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/>
+ <splitter class="tree-splitter"/>
<treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/>
</treecols>
<treechildren id="treechildren"/>
diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js
@@ -65,19 +65,32 @@ var Zotero_RecognizePDF = new function() {
*
* @param {nsIFile} file The PDF file to retrieve metadata for
* @param {Integer|null} libraryID The library in which to save the PDF
+ * @param {Function} stopCheckCallback Function that returns true if the
+ * process is to be interrupted
* @return {Promise} A promise resolved when PDF metadata has been retrieved
*/
- this.recognize = function(file, libraryID) {
+ this.recognize = function(file, libraryID, stopCheckCallback) {
const MAX_PAGES = 7;
- const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms
+ var me = this;
return _extractText(file, MAX_PAGES).then(function(lines) {
// Look for DOI - Use only first 80 lines to avoid catching article references
var allText = lines.join("\n"),
- doi = Zotero.Utilities.cleanDOI(lines.slice(0,80).join('\n')),
+ firstChunk = lines.slice(0,80).join('\n'),
+ doi = Zotero.Utilities.cleanDOI(firstChunk),
promise;
Zotero.debug(allText);
+ if(!doi) {
+ 	// No DOI in the first chunk: look for a JSTOR stable URL instead. The
+ 	// stable ID becomes a DOI by prepending 10.2307, unless the URL already
+ 	// carries a full DOI (starts with "10.").
+ 	// The dot after "www" is escaped so that only a literal "www.jstor.org"
+ 	// matches; the earlier pattern "www.\jstor" had the backslash misplaced
+ 	// and accepted any character between "www" and "jstor".
+ 	var jstorMatch = firstChunk.match(/www\.jstor\.org\/stable\/(\S+)/i);
+ 	if(jstorMatch) {
+ 		var stableID = jstorMatch[1];
+ 		doi = Zotero.Utilities.cleanDOI(
+ 			stableID.indexOf('10.') === 0 ? stableID : '10.2307/' + stableID
+ 		);
+ 	}
+ }
+
if(doi) {
// Look up DOI
Zotero.debug("RecognizePDF: Found DOI: "+doi);
@@ -104,118 +117,7 @@ var Zotero_RecognizePDF = new function() {
// If no DOI or ISBN, query Google Scholar
return promise.fail(function(error) {
Zotero.debug("RecognizePDF: "+error);
-
- // Use only first column from multi-column lines
- const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
- var cleanedLines = [], cleanedLineLengths = [];
- for(var i=0; i<lines.length && cleanedLines.length<100; i++) {
- var m = lineRe.exec(lines[i]);
- if(m && m[1].split(' ').length > 3) {
- cleanedLines.push(m[1]);
- cleanedLineLengths.push(m[1].length);
- }
- }
-
- // get (not quite) median length
- var lineLengthsLength = cleanedLineLengths.length;
- if(lineLengthsLength < 20
- || cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
- throw new Zotero.Exception.Alert("recognizePDF.noOCR");
- }
-
- var sortedLengths = cleanedLineLengths.sort(),
- medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
-
- // pick lines within 6 chars of the median (this is completely arbitrary)
- var goodLines = [],
- uBound = medianLength + 6,
- lBound = medianLength - 6;
- for (var i=0; i<lineLengthsLength; i++) {
- if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
- // Strip quotation marks so they don't mess up search query quoting
- var line = cleanedLines[i].replace('"', '');
- goodLines.push(line);
- }
- }
-
- var nextLine = 0,
- limited = false,
- queryGoogle = function() {
- // Once we hit the CAPTCHA once, don't keep trying
- if(limited) throw new Zotero.Exception.Alert("recognizePDF.limit");
-
- // Take the relevant parts of some lines (exclude hyphenated word)
- var queryString = "", queryStringWords = 0;
- while(queryStringWords < 25) {
- if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
-
- var words = goodLines.splice(nextLine, 1)[0].split(/\s+/);
- // Try to avoid picking adjacent strings so the odds of them appearing in another
- // document quoting our document is low. Every 7th line is a magic value
- nextLine = (nextLine + 7) % goodLines.length;
-
- // get rid of first and last words
- words.shift();
- words.pop();
- // make sure there are no long words (probably OCR mistakes)
- var skipLine = false;
- for(var i=0; i<words.length; i++) {
- if(words[i].length > 20) {
- skipLine = true;
- break;
- }
- }
- // add words to query
- if(!skipLine && words.length) {
- queryStringWords += words.length;
- queryString += '"'+words.join(" ")+'" ';
- }
- }
-
- Zotero.debug("RecognizePDF: Query string "+queryString);
-
- var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search",
- delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime);
-
- // Delay
- return (delay > 0 ? Q.delay(delay) : Q.when())
- .then(function() {
- Zotero.HTTP.lastGoogleScholarQueryTime = Date.now();
- return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
- })
- .then(function(xmlhttp) {
- var doc = xmlhttp.response,
- deferred = Q.defer(),
- translate = new Zotero.Translate.Web();
-
- if(Zotero.Utilities.xpath(doc, "//form[@action='Captcha']").length) {
- // Hit CAPTCHA
- limited = true;
- throw new Zotero.Exception.Alert("recognizePDF.limit");
- }
-
- translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
- translate.setDocument(Zotero.HTTP.wrapDocument(doc, url));
- translate.setHandler("translators", function(translate, detected) {
- if(detected.length) {
- deferred.resolve(_promiseTranslate(translate, libraryID));
- } else {
- deferred.reject(new Zotero.Exception.Alert("recognizePDF.noMatches"));
- }
- });
- translate.getTranslators();
-
- return deferred.promise;
- }, function(e) {
- if(e instanceof Zotero.HTTP.UnexpectedStatusException && e.status == 403) {
- // Hit hard block
- throw new Zotero.Exception.Alert("recognizePDF.limit");
- }
- throw e;
- });
- };
-
- return queryGoogle().fail(queryGoogle).fail(queryGoogle);
+ return me.GSFullTextSearch.findItem(lines, libraryID, stopCheckCallback);
});
});
}
@@ -331,41 +233,15 @@ var Zotero_RecognizePDF = new function() {
}
// Validate ISBNs
- var validIsbns = [];
+ var validIsbns = [], cleanISBN;
for (var i =0; i < isbns.length; i++) {
- if(_isValidISBN(isbns[i])) validIsbns.push(isbns[i]);
+ cleanISBN = Zotero.Utilities.cleanISBN(isbns[i]);
+ if(cleanISBN) validIsbns.push(cleanISBN);
}
return validIsbns;
}
/**
- * Check whether an ISBNs is valid
- * @private
- * @return {Boolean}
- */
- function _isValidISBN(isbn) {
- if(isbn.length == 13) {
- // ISBN-13 should start with 978 or 979 i.e. GS1 for book publishing industry
- var prefix = isbn.slice(0,3);
- if (prefix != "978" && prefix != "979") return false;
- // Verify check digit
- var check = 0;
- for (var i = 0; i < 13; i+=2) check += isbn[i]*1;
- for (i = 1; i < 12; i+=2) check += 3 * isbn[i]*1;
- return (check % 10 == 0);
- } else if(isbn.length == 10) {
- // Verify ISBN-10 check digit
- var check = 0;
- for (var i = 0; i < 9; i++) check += isbn[i]*1 * (10-i);
- // last number might be 'X'
- if (isbn[9] == 'X' || isbn[9] == 'x') check += 10;
- else check += isbn[i]*1;
- return (check % 11 == 0);
- }
- return false;
- }
-
- /**
* @class Handles UI, etc. for recognizing multiple items
*/
this.ItemRecognizer = function () {
@@ -388,7 +264,7 @@ var Zotero_RecognizePDF = new function() {
this._items = items.slice();
this._itemTotal = items.length;
- this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen");
+ _progressWindow = this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen");
this._progressWindow.addEventListener("pageshow", function() { me._onWindowLoaded() }, false);
},
@@ -398,7 +274,15 @@ var Zotero_RecognizePDF = new function() {
"stop": function() {
this._stopped = true;
},
-
+
+ /**
+ * Halts recognition (by calling stop(), which sets the _stopped flag) and
+ * closes the progress window
+ */
+ "close": function() {
+ this.stop();
+ this._progressWindow.close();
+ },
+
/**
* Called when the progress window has been opened; adds items to the tree and begins recognizing
* @param
@@ -406,9 +290,11 @@ var Zotero_RecognizePDF = new function() {
"_onWindowLoaded": function() {
// populate progress window
var treechildren = this._progressWindow.document.getElementById("treechildren");
+ this._rowIDs = [];
for(var i in this._items) {
var treeitem = this._progressWindow.document.createElement('treeitem');
var treerow = this._progressWindow.document.createElement('treerow');
+ this._rowIDs.push(this._items[i].id);
var treecell = this._progressWindow.document.createElement('treecell');
treecell.setAttribute("id", "item-"+this._items[i].id+"-icon");
@@ -427,12 +313,22 @@ var Zotero_RecognizePDF = new function() {
}
var me = this;
- this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator");
- this._progressWindow.document.getElementById("cancel-button").addEventListener("command", function() {
- me.stop();
- me._progressWindow.close();
- }, false);
- this._progressWindow.addEventListener("close", function() { me.stop() }, false);
+
+ this._progressWindow.document.getElementById("tree").addEventListener(
+ "dblclick", function(event) { me._onDblClick(event, this); });
+
+ this._cancelHandler = function() { me.stop() };
+ this._keypressCancelHandler = function(e) {
+ if(e.keyCode === KeyEvent.DOM_VK_ESCAPE) me.stop();
+ };
+
+ _progressIndicator = this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator");
+ this._progressWindow.document.getElementById("cancel-button")
+ .addEventListener("command", this._cancelHandler, false);
+ // Also cancel if the user presses Esc
+ this._progressWindow.addEventListener("keypress", this._keypressCancelHandler);
+ this._progressWindow.addEventListener("close", this._cancelHandler, false);
+ Zotero_RecognizePDF.GSFullTextSearch.resetQueryLimit();
this._recognizeItem();
},
@@ -452,23 +348,31 @@ var Zotero_RecognizePDF = new function() {
return;
}
+ // Order here matters. Otherwise we may show an incorrect label
+ if(this._stopped) {
+ this._done(true);
+ return;
+ }
+
this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100;
var item = this._items.shift(),
itemIcon = this._progressWindow.document.getElementById("item-"+item.id+"-icon"),
- itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title");
+ itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title"),
+ rowNumber = this._rowIDs.indexOf(item.id);
itemIcon.setAttribute("src", LOADING_IMAGE);
+ itemTitle.setAttribute("label", "");
var file = item.getFile(), me = this;
(file
- ? Zotero_RecognizePDF.recognize(file, item.libraryID)
+ ? Zotero_RecognizePDF.recognize(file, item.libraryID, function() { return me._stopped; })
: Q.reject(new Zotero.Exception.Alert("recognizePDF.fileNotFound")))
.then(function(newItem) {
// If already stopped, delete
if(me._stopped) {
- Zotero.Items.erase(item.id);
- return;
+ Zotero.Items.erase(newItem.id);
+ throw new Zotero.Exception.Alert('recognizePDF.stopped');
}
// put new item in same collections as the old one
@@ -484,37 +388,504 @@ var Zotero_RecognizePDF = new function() {
itemTitle.setAttribute("label", newItem.getField("title"));
itemIcon.setAttribute("src", SUCCESS_IMAGE);
+ me._rowIDs[rowNumber] = newItem.id;
me._recognizeItem();
- }, function(error) {
+ })
+ .catch(function(error) {
Zotero.debug(error);
Zotero.logError(error);
itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error"));
itemIcon.setAttribute("src", FAILURE_IMAGE);
- if(error instanceof Zotero.Exception.Alert && error.name === "recognizePDF.limit") {
- me._done();
+ // Don't show "completed" label if stopped on last item
+ if(me._stopped && !me._items.length) {
+ me._done(true);
} else {
me._recognizeItem();
}
- }).fin(function() {
+ }).finally(function() {
// scroll to this item
- me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-5));
+ me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-4));
}).done();
},
/**
- * Cleans up after items are recognized, disabling the cancel button and making the progress window
- * close on blur
+ * Cleans up after items are recognized, disabling the cancel button and
+ * making the progress window close on blur.
+ * @param {Boolean} cancelled Whether the process was cancelled
*/
- "_done": function() {
+ "_done": function(cancelled) {
this._progressIndicator.value = 100;
- this._progressWindow.document.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label");
- var me = this;
- this._progressWindow.addEventListener("blur",
- function() { me._progressWindow.setTimeout(function() { me._progressWindow.close() }, 2000) }, false);
- this._progressWindow.document.getElementById("label").value = Zotero.getString("recognizePDF.complete.label");
+ // Switch out cancel for close
+ var cancelButton = this._progressWindow.document.getElementById("cancel-button"),
+ me = this;
+ cancelButton.label = Zotero.getString("recognizePDF.close.label");
+ cancelButton.removeEventListener("command", this._cancelHandler, false);
+ cancelButton.addEventListener("command", function() { me.close() }, false);
+ this._progressWindow.removeEventListener("keypress", this._keypressCancelHandler);
+ this._progressWindow.addEventListener("keypress", function() { me.close() });
+
+ if(Zotero.isMac) {
+ // On MacOS X, the windows are not always on top, so we hide them on
+ // blur to avoid clutter
+ this._setCloseTimer();
+ }
+ this._progressWindow.document.getElementById("label").value =
+ cancelled ? Zotero.getString("recognizePDF.cancelled.label")
+ : Zotero.getString("recognizePDF.complete.label");
+ },
+
+ /**
+ * Make the progress window close itself 5 seconds after it loses focus.
+ * If the window regains focus while a close is pending, the pending close
+ * is cancelled and both listeners are removed, so the window will not
+ * auto-close again.
+ * @private
+ */
+ "_setCloseTimer": function() {
+ 	var progressWin = this._progressWindow;
+
+ 	function onBlur() {
+ 		// Close window after losing focus for 5 seconds; the timer ID is
+ 		// kept on the window object so onFocus can find it
+ 		progressWin.zoteroCloseTimeoutID = progressWin.setTimeout(function() {
+ 			progressWin.close();
+ 		}, 5000);
+ 		// Prevent auto-close if we gain focus again
+ 		progressWin.addEventListener("focus", onFocus, false);
+ 	}
+
+ 	function onFocus() {
+ 		// No close pending, nothing to cancel
+ 		if(!progressWin.zoteroCloseTimeoutID) return;
+
+ 		progressWin.clearTimeout(progressWin.zoteroCloseTimeoutID);
+ 		delete progressWin.zoteroCloseTimeoutID;
+
+ 		progressWin.removeEventListener('blur', onBlur, false);
+ 		progressWin.removeEventListener('focus', onFocus, false);
+ 	}
+
+ 	progressWin.addEventListener("blur", onBlur, false);
+ },
+
+ /**
+ * Focus items in Zotero library when double-clicking them in the Retrieve
+ * metadata window. The item is looked up in this._rowIDs by the index of the
+ * tree row under the pointer; nothing happens if no ID is found there.
+ * @param {Event} event The dblclick event
+ * @param {tree} tree XUL tree object
+ * @private
+ */
+ "_onDblClick": function(event, tree) {
+ if (event && tree && event.type == "dblclick") {
+ // Map the clicked position to a row index, then to the item ID stored for that row
+ var itemID = this._rowIDs[tree.treeBoxObject.getRowAt(event.clientX, event.clientY)];
+ if(!itemID) return;
+
+ // Get the right window. In tab mode, it's the container window
+ var lastWin = (window.ZoteroTab ? window.ZoteroTab.containerWindow : window);
+
+ // Only call toggleDisplay when this window has a ZoteroOverlay
+ if (lastWin.ZoteroOverlay) {
+ lastWin.ZoteroOverlay.toggleDisplay(true);
+ }
+
+ lastWin.ZoteroPane.selectItem(itemID, false, true);
+ lastWin.focus();
+ }
}
- }
+ };
+
+ /**
+ * Singleton for querying Google Scholar. Ensures that all queries are
+ * sequential and respect the delay in between queries.
+ * @namespace
+ */
+ this.GSFullTextSearch = new function() {
+ const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // In ms
+ var queryLimitReached = false,
+ inProgress = false,
+ queue = [],
+ stopCheckCallback; // As long as we process one query at a time, this is ok
+ // Load nsICookieManager2
+ Components.utils.import("resource://gre/modules/Services.jsm");
+ var cookieService = Services.cookies;
+
+ /**
+ * Reset "Query Limit Reached" flag, so that we attempt to query Google again.
+ * While the flag is set, findItem and _processQueue reject requests with a
+ * "recognizePDF.limit" alert instead of querying.
+ */
+ this.resetQueryLimit = function() {
+ queryLimitReached = false;
+ };
+
+ /**
+ * Enqueue a full-text Google Scholar lookup and start queue processing
+ * if it is not already running.
+ * @param {String[]} lines Lines of text to use for full-text query
+ * @param {Integer | null} libraryID Library to save the item to
+ * @param {Function} stopCheckCallback Function that returns true if the
+ * process is to be interrupted
+ * @return {Promise} A promise resolved when PDF metadata has been retrieved;
+ * rejected right away with "recognizePDF.limit" when the query limit was
+ * already reached and nothing is in progress
+ */
+ this.findItem = function(lines, libraryID, stopCheckCallback) {
+ 	// With the limit hit and no query running there is no queue to wait
+ 	// for, so fail fast
+ 	var blockedAndIdle = queryLimitReached && !inProgress;
+ 	if(blockedAndIdle) {
+ 		return Q.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
+ 	}
+
+ 	var request = {
+ 		deferred: Q.defer(),
+ 		lines: lines,
+ 		libraryID: libraryID,
+ 		stopCheckCallback: stopCheckCallback
+ 	};
+ 	queue.push(request);
+ 	_processQueue();
+ 	return request.deferred.promise;
+ };
+
+ /**
+ * Process Google Scholar queue, one request at a time. Each request is
+ * taken off the queue, turned into a query via getGoodLines/queryGoogle and
+ * its deferred is resolved with that promise; when the promise settles the
+ * function calls itself again for the next request.
+ * @private
+ * @param {Boolean} proceed Whether we should pop the next item off the queue
+ * This should not be true unless being called after processing
+ * another item
+ */
+ function _processQueue(proceed) {
+ if(inProgress && !proceed) return; //only one at a time
+
+ // Queue drained: mark processing as finished
+ if(!queue.length) {
+ inProgress = false;
+ return;
+ }
+
+ inProgress = true;
+ if(queryLimitReached) {
+ // Irreversibly blocked. Reject remaining items in queue
+ var item;
+ while(item = queue.shift()) {
+ item.deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit"));
+ }
+ _processQueue(true); // Wrap it up
+ } else {
+ var item = queue.shift();
+
+ // Shared with _checkCaptchaOK/_checkCaptchaError, which consult it
+ // while this request is being processed
+ stopCheckCallback = item.stopCheckCallback;
+ if(stopCheckCallback && stopCheckCallback()) {
+ // Caller asked to stop: reject this request and move on to the next
+ item.deferred.reject(new Zotero.Exception.Alert('recognizePDF.stopped'));
+ _processQueue(true);
+ return;
+ }
+
+ // Q.try turns an exception thrown by getGoodLines into a rejection
+ item.deferred.resolve(
+ Q.try(getGoodLines, item.lines)
+ .then(function(lines) {
+ return queryGoogle(lines, item.libraryID, 3); // Try querying 3 times
+ })
+ // Continue with the next request whether this one succeeded or failed
+ .finally(function() { _processQueue(true); })
+ );
+ }
+ }
+
+ /**
+ * Select lines that are good candidates for Google Scholar query.
+ * Takes the first column of up to 100 lines that contain more than three
+ * space-separated words, then keeps those whose length is within 6
+ * characters of the (not quite) median length, with quotation marks removed.
+ * @private
+ * @param {String[]} lines
+ * @return {String[]}
+ * @throws {Zotero.Exception.Alert} "recognizePDF.noOCR" if fewer than 20
+ * usable lines are found or the text starts with the Google Books
+ * scan notice
+ */
+ function getGoodLines(lines) {
+ 	// Use only first column from multi-column lines
+ 	const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/;
+ 	var cleanedLines = [], cleanedLineLengths = [];
+ 	for(var i=0; i<lines.length && cleanedLines.length<100; i++) {
+ 		var m = lineRe.exec(lines[i]);
+ 		if(m && m[1].split(' ').length > 3) {
+ 			cleanedLines.push(m[1]);
+ 			cleanedLineLengths.push(m[1].length);
+ 		}
+ 	}
+
+ 	// Get (not quite) median length
+ 	var lineLengthsLength = cleanedLineLengths.length;
+ 	if(lineLengthsLength < 20
+ 			|| cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") {
+ 		throw new Zotero.Exception.Alert("recognizePDF.noOCR");
+ 	}
+
+ 	// Sort a copy, and sort numerically. sort() works in place and compares
+ 	// elements as strings by default; sorting cleanedLineLengths itself would
+ 	// both misorder the lengths (e.g. 100 before 95) and break the index
+ 	// correspondence with cleanedLines that the loop below relies on.
+ 	var sortedLengths = cleanedLineLengths.slice().sort(function(a, b) { return a - b; }),
+ 		medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
+
+ 	// Pick lines within 6 chars of the median (this is completely arbitrary)
+ 	var goodLines = [],
+ 		uBound = medianLength + 6,
+ 		lBound = medianLength - 6;
+ 	for (var i=0; i<lineLengthsLength; i++) {
+ 		if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) {
+ 			// Strip all quotation marks so they don't mess up search query
+ 			// quoting (a string pattern would only remove the first one)
+ 			var line = cleanedLines[i].replace(/"/g, '');
+ 			goodLines.push(line);
+ 		}
+ 	}
+ 	return goodLines;
+ }
+
+ /**
+ * Query Google Scholar with a quoted full-text query built from some of
+ * the given lines, and hand the result page to the Google Scholar
+ * translator. If the translator detects nothing, the function calls itself
+ * with one fewer try, using the lines that are still left.
+ * @private
+ * @param {String[]} goodLines Candidate lines; lines used for the query are
+ * removed from this array (it is modified in place)
+ * @param {Integer | null} libraryID
+ * @param {Integer} tries Number of queries to attempt before giving up
+ * @return {Promise} A promise resolved when PDF metadata has been retrieved
+ * @throws {Zotero.Exception.Alert} "recognizePDF.noMatches" when no tries
+ * or no lines are left
+ */
+ function queryGoogle(goodLines, libraryID, tries) {
+ if(tries <= 0) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
+
+ // Take the relevant parts of some lines (exclude hyphenated word)
+ var queryString = "", queryStringWords = 0, nextLine = 0;
+ while(queryStringWords < 25) {
+ if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches");
+
+ // Remove the chosen line from goodLines so it is not reused
+ var words = goodLines.splice(nextLine, 1)[0].split(/\s+/);
+ // Try to avoid picking adjacent strings so the odds of them appearing in another
+ // document quoting our document is low. Every 7th line is a magic value
+ nextLine = (nextLine + 7) % goodLines.length;
+
+ // Get rid of first and last words
+ words.shift();
+ words.pop();
+ // Make sure there are no long words (probably OCR mistakes)
+ var skipLine = false;
+ for(var i=0; i<words.length; i++) {
+ if(words[i].length > 20) {
+ skipLine = true;
+ break;
+ }
+ }
+ // Add words to query
+ if(!skipLine && words.length) {
+ queryStringWords += words.length;
+ queryString += '"'+words.join(" ")+'" ';
+ }
+ }
+
+ Zotero.debug("RecognizePDF: Query string " + queryString);
+
+ // Time still to wait so that queries are at least GOOGLE_SCHOLAR_QUERY_DELAY ms apart
+ var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search",
+ delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime);
+
+ // Delay
+ return (delay > 0 ? Q.delay(delay) : Q())
+ .then(function() {
+ Zotero.HTTP.lastGoogleScholarQueryTime = Date.now();
+ return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
+ })
+ // Both the success and the error response are checked for a CAPTCHA,
+ // allowing up to 3 attempts at solving it
+ .then(function(xmlhttp) {
+ return _checkCaptchaOK(xmlhttp, 3);
+ },
+ function(e) {
+ return _checkCaptchaError(e, 3);
+ })
+ .then(function(xmlhttp) {
+ var doc = xmlhttp.response,
+ deferred = Q.defer(),
+ translate = new Zotero.Translate.Web();
+
+ translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
+ translate.setDocument(Zotero.HTTP.wrapDocument(doc, url));
+ translate.setHandler("translators", function(translate, detected) {
+ if(detected.length) {
+ deferred.resolve(_promiseTranslate(translate, libraryID));
+ } else {
+ // Nothing detected on this page: retry with the remaining lines.
+ // Q.try converts a synchronous throw from queryGoogle into a rejection
+ deferred.resolve(Q.try(function() {
+ return queryGoogle(goodLines, libraryID, tries-1);
+ }));
+ }
+ });
+ translate.getTranslators();
+
+ return deferred.promise;
+ })
+ .catch(function(e) {
+ // Remember that the limit was hit so later requests are rejected
+ // without querying; the error itself is passed on unchanged
+ if(e.name == "recognizePDF.limit") {
+ queryLimitReached = true;
+ }
+ throw e;
+ });
+ }
+
+ /**
+ * Check for CAPTCHA on a page with HTTP 200 status
+ * @private
+ * @param {XMLHttpRequest} xmlhttp
+ * @param {Integer} tries Number of CAPTCHA attempts left, passed on to
+ * _solveCaptcha
+ * @return {XMLHttpRequest|Promise} The same xmlhttp if the page contains no
+ * CAPTCHA form, otherwise the result of _solveCaptcha
+ * @throws {Zotero.Exception.Alert} "recognizePDF.stopped" if the stop
+ * callback of the current request returns true
+ */
+ function _checkCaptchaOK(xmlhttp, tries) {
+ if(stopCheckCallback && stopCheckCallback()) {
+ throw new Zotero.Exception.Alert('recognizePDF.stopped');
+ }
+
+ // A form with action "Captcha" marks the CAPTCHA page
+ if(Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']").length) {
+ return _solveCaptcha(xmlhttp, tries);
+ }
+ return xmlhttp;
+ }
+
+ /**
+ * Check for CAPTCHA on an error page. Handle 403 and 503 pages.
+ * If the error page has no CAPTCHA, certain Google cookies are cleared and
+ * the original request is repeated once. Errors other than a 403/503
+ * UnexpectedStatusException with a response body are rethrown unchanged.
+ * @private
+ * @param {Zotero.HTTP.UnexpectedStatusException} e HTTP response error object
+ * @param {Integer} tries Number of CAPTCHA attempts left, passed on to
+ * _solveCaptcha
+ * @param {Boolean} dontClearCookies If true, skip the step that clears
+ * cookies and repeats the request in order to get CAPTCHA to show up
+ * @return {Promise} Result of _solveCaptcha, or of the repeated request
+ * @throws {Zotero.Exception.Alert} "recognizePDF.stopped" if the stop
+ * callback returns true; "recognizePDF.limit" if no CAPTCHA could be
+ * obtained
+ */
+ function _checkCaptchaError(e, tries, dontClearCookies) {
+ if(stopCheckCallback && stopCheckCallback()) {
+ throw new Zotero.Exception.Alert('recognizePDF.stopped');
+ }
+
+ // Check for captcha on error page
+ if(e instanceof Zotero.HTTP.UnexpectedStatusException
+ && (e.status == 403 || e.status == 503) && e.xmlhttp.response) {
+ if(_extractCaptchaFormData(e.xmlhttp.response)) {
+ return _solveCaptcha(e.xmlhttp, tries);
+ } else if(!dontClearCookies && e.xmlhttp.channel) { // Make sure we can obtain original URL
+ // AFAICT, for 403 errors, GS just says "sorry, try later",
+ // but if you clear cookies, you get a CAPTCHA
+ if(!_clearGSCookies(e.xmlhttp.channel.originalURI.host)) {
+ //user said no or no cookies removed
+ throw new Zotero.Exception.Alert('recognizePDF.limit');
+ }
+ // Redo GET request
+ return Zotero.HTTP.promise("GET", e.xmlhttp.channel.originalURI.spec, {"responseType":"document"})
+ .then(function(xmlhttp) {
+ return _checkCaptchaOK(xmlhttp, tries);
+ },
+ function(e) {
+ return _checkCaptchaError(e, tries, true); // Don't try this again
+ });
+ }
+
+ // Neither a CAPTCHA nor a chance to retry: treat as query limit
+ Zotero.debug("RecognizePDF: Google Scholar returned an unexpected page"
+ + " with status " + e.status);
+ throw new Zotero.Exception.Alert('recognizePDF.limit');
+ }
+ throw e;
+ }
+
+ /**
+ * Prompt user to enter CAPTCHA, submit the answer with a GET request built
+ * from the CAPTCHA form, and check the response for another CAPTCHA.
+ * @private
+ * @param {XMLHttpRequest} xmlhttp Request whose response is the CAPTCHA page
+ * @param {Integer} [tries] Number of CAPTCHA attempts left (default 3)
+ * @return {Promise} Promise for the result of checking the response to
+ * the submitted answer (_checkCaptchaOK / _checkCaptchaError)
+ * @throws {Zotero.Exception.Alert} "recognizePDF.limit" if no attempts are
+ * left, no CAPTCHA form is found, or the user enters nothing
+ */
+ function _solveCaptcha(xmlhttp, tries) {
+ var doc = xmlhttp.response;
+
+ if(tries === undefined) tries = 3;
+
+ if(!tries) {
+ Zotero.debug("RecognizePDF: Failed to solve CAPTCHA after multiple attempts.");
+ throw new Zotero.Exception.Alert('recognizePDF.limit');
+ }
+
+ // This attempt counts against the remaining tries
+ tries--;
+ var formData = doc && _extractCaptchaFormData(doc);
+ if(!formData) {
+ Zotero.debug("RecognizePDF: Could not find CAPTCHA on page.");
+ throw new Zotero.Exception.Alert('recognizePDF.limit');
+ }
+
+ var io = { dataIn: {
+ title: Zotero.getString("recognizePDF.captcha.title"),
+ description: Zotero.getString("recognizePDF.captcha.description"),
+ imgUrl: formData.img
+ }};
+
+ // Opened with the "modal" feature, so presumably this call does not
+ // return until the dialog is closed; the dialog stores the answer in
+ // io.dataOut (see Zotero_Captcha.resolve in captcha.js)
+ _progressWindow.openDialog("chrome://zotero/content/captcha.xul", "",
+ "chrome,modal,resizable=no,centerscreen", io);
+
+ if(!io.dataOut) {
+ Zotero.debug("RecognizePDF: No CAPTCHA entered");
+ throw new Zotero.Exception.Alert('recognizePDF.limit');
+ }
+
+ // Build the query string from all named form inputs plus the answer
+ formData.input.captcha = io.dataOut.captcha;
+ var url = '', prop;
+ for(prop in formData.input) {
+ url += '&' + encodeURIComponent(prop) + '='
+ + encodeURIComponent(formData.input[prop]);
+ }
+
+ // substr(1) drops the leading '&'
+ url = formData.action + '?' + url.substr(1);
+
+ return Zotero.HTTP.promise("GET", url, {"responseType":"document"})
+ .then(function(xmlhttp) {
+ return _checkCaptchaOK(xmlhttp, tries);
+ },
+ function(e) {
+ return _checkCaptchaError(e, tries);
+ });
+ }
+
+ /**
+ * Extract CAPTCHA form-related data from the CAPTCHA page. Uses the first
+ * image and the first form found in the document.
+ * @private
+ * @param {Document} doc DOM document object for the CAPTCHA page
+ * @return {Object|undefined} Object containing data describing CAPTCHA form:
+ * img (image URL), action (form action URL) and input (name/value map
+ * of the form's named inputs). Undefined if the page has no image or
+ * no form.
+ */
+ function _extractCaptchaFormData(doc) {
+ var formData = {};
+
+ var img = doc.getElementsByTagName('img')[0];
+ if(!img) return;
+ formData.img = img.src;
+
+ var form = doc.forms[0];
+ if(!form) return;
+
+ formData.action = form.action;
+ formData.input = {};
+ var inputs = form.getElementsByTagName('input');
+ for(var i=0, n=inputs.length; i<n; i++) {
+ // Inputs without a name cannot be submitted as parameters
+ if(!inputs[i].name) continue;
+ formData.input[inputs[i].name] = inputs[i].value;
+ }
+
+ // NOTE(review): this sets a property on formData itself, not on
+ // formData.input, and nothing in the visible code reads it - confirm
+ // whether formData.input.continue was intended
+ formData.continue = "http://scholar.google.com";
+
+ return formData;
+ }
+
+ /**
+ * Clear Google cookies to get the CAPTCHA page to appear. Only cookies
+ * named GDSESS or PREF for the given host are removed.
+ * @private
+ * @param {String} host Host of the Google Scholar page (in case it's proxied)
+ * @return {Boolean} Whether any cookies were cleared
+ */
+ function _clearGSCookies(host) {
+ /* There don't seem to be any negative effects of deleting GDSESS
+ if(!Zotero.isStandalone) {
+ //ask user first
+ var response = Components.classes["@mozilla.org/embedcomp/prompt-service;1"]
+ .getService(Components.interfaces.nsIPromptService)
+ .confirm(null, "Clear Google Scholar cookies?",
+ "Google Scholar is attempting to block further queries. We can "
+ + "clear certain cookies and try again. This may affect some "
+ + "temporary Google preferences or it may log you out. May we clear"
+ + " your Google Scholar cookies?");
+ if(!response) return;
+ }*/
+
+ // Walk all cookies stored for the host and remove the matching ones
+ var removed = false, cookies = cookieService.getCookiesFromHost(host);
+ while(cookies.hasMoreElements()) {
+ var cookie = cookies.getNext().QueryInterface(Components.interfaces.nsICookie2);
+ if(["GDSESS", "PREF"].indexOf(cookie.name) !== -1) { // GDSESS doesn't seem to always be enough
+ Zotero.debug("RecognizePDF: Removing cookie " + cookie.name + " for host "
+ + cookie.host + " and path " + cookie.path);
+ cookieService.remove(cookie.host, cookie.name, cookie.path, false);
+ removed = true;
+ }
+ }
+
+ if(!removed) {
+ Zotero.debug("RecognizePDF: No cookies removed");
+ }
+
+ return removed;
+ }
+ };
}
\ No newline at end of file
diff --git a/chrome/locale/en-US/zotero/zotero.dtd b/chrome/locale/en-US/zotero/zotero.dtd
@@ -4,6 +4,8 @@
<!ENTITY zotero.general.deselectAll "Deselect All">
<!ENTITY zotero.general.edit "Edit">
<!ENTITY zotero.general.delete "Delete">
+<!ENTITY zotero.general.ok "OK">
+<!ENTITY zotero.general.cancel "Cancel">
<!ENTITY zotero.errorReport.title "Zotero Error Report">
<!ENTITY zotero.errorReport.unrelatedMessages "The error log may include messages unrelated to Zotero.">
@@ -253,7 +255,6 @@
<!ENTITY zotero.recognizePDF.cancel.label "Cancel">
<!ENTITY zotero.recognizePDF.pdfName.label "PDF Name">
<!ENTITY zotero.recognizePDF.itemName.label "Item Name">
-<!ENTITY zotero.recognizePDF.captcha.label "Type the text below to continue retrieving metadata.">
<!ENTITY zotero.rtfScan.title "RTF Scan">
<!ENTITY zotero.rtfScan.cancel.label "Cancel">
@@ -282,4 +283,4 @@
<!ENTITY zotero.downloadManager.label "Save to Zotero">
<!ENTITY zotero.downloadManager.saveToLibrary.description "Attachments cannot be saved to the currently selected library. This item will be saved to your library instead.">
-<!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences.">
+<!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences.">
+\ No newline at end of file
diff --git a/chrome/locale/en-US/zotero/zotero.properties b/chrome/locale/en-US/zotero/zotero.properties
@@ -895,12 +895,16 @@ proxies.recognized.add = Add Proxy
recognizePDF.noOCR = PDF does not contain OCRed text.
recognizePDF.couldNotRead = Could not read text from PDF.
-recognizePDF.noMatches = No matching references found.
-recognizePDF.fileNotFound = File not found.
-recognizePDF.limit = Query limit reached. Try again later.
+recognizePDF.noMatches = No matching references found
+recognizePDF.fileNotFound = File not found
+recognizePDF.limit = Google Scholar query limit reached. Try again later.
recognizePDF.error = An unexpected error occurred.
-recognizePDF.complete.label = Metadata Retrieval Complete.
+recognizePDF.stopped = Cancelled
+recognizePDF.complete.label = Metadata Retrieval Complete
+recognizePDF.cancelled.label = Metadata Retrieval Cancelled
recognizePDF.close.label = Close
+recognizePDF.captcha.title = Please enter CAPTCHA
+recognizePDF.captcha.description = Zotero uses Google Scholar to help identify PDFs. To continue using Google Scholar, please enter the text from the image below.
rtfScan.openTitle = Select a file to scan
rtfScan.scanning.label = Scanning RTF Document…
diff --git a/chrome/skin/default/zotero/zotero.css b/chrome/skin/default/zotero/zotero.css
@@ -303,7 +303,6 @@ label.zotero-text-link {
margin-bottom: 1em;
}
-
.zotero-small-progress-indicator {
list-style-image: url(chrome://global/skin/icons/notloading_16.png);
margin-left: -2px;
@@ -316,4 +315,19 @@ label.zotero-text-link {
#zotero-note-window {
padding-bottom: 4px;
+}
+
+#zotero-captcha-description {
+ max-width: 300px;
+ padding-bottom: 4px;
+ text-align: justify;
+}
+
+#zotero-captcha-error {
+ max-width: 300px;
+ padding-bottom: 4px;
+ padding-top: 4px;
+ font-weight: bold;
+ color: red;
+ text-align: center;
}
\ No newline at end of file