www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 57350fae1eabbc0fdd471645076f731845b7d835
parent 4bedb61aa219b0add29e1c5b8fdc445fd8f918d4
Author: aurimasv <aurimas.dev@gmail.com>
Date:   Tue, 14 Jan 2014 02:17:58 -0600

[Retrieve Metadata] Use a single queue to query Google Scholar. Window closing tweaks.
* Close window on blur after completion on Mac (revert previous change)
* Don't close window when canceling
* Add Esc handler to cancel/close window
* Allow columns to be resized
* Fixes #445
* Fixes #444

Diffstat:
M chrome/content/zotero/captcha.js | 21 +++++++++++++++++++++
M chrome/content/zotero/captcha.xul | 12 +++++++++---
M chrome/content/zotero/pdfProgress.xul | 2 ++
M chrome/content/zotero/recognizePDF.js | 648 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------------
M chrome/locale/en-US/zotero/zotero.dtd | 7 ++-----
M chrome/locale/en-US/zotero/zotero.properties | 10 +++++++---
M chrome/skin/default/zotero/zotero.css | 16 +++++++++++++++-
7 files changed, 466 insertions(+), 250 deletions(-)

diff --git a/chrome/content/zotero/captcha.js b/chrome/content/zotero/captcha.js @@ -28,6 +28,27 @@ var Zotero_Captcha = new function() { this.onLoad = function() { this._io = window.arguments[0]; + var description = document.getElementById('zotero-captcha-description'), + errorMsg = document.getElementById('zotero-captcha-error'); + + if(this._io.dataIn.title) { + document.title = this._io.dataIn.title; + } + + if(this._io.dataIn.description) { + description.textContent = this._io.dataIn.description; + description.hidden = false; + } else { + description.hidden = true; + } + + if(this._io.dataIn.error) { + errorMsg.textContent = this._io.dataIn.error; + errorMsg.hidden = false; + } else { + errorMsg.hidden = true; + } + document.getElementById('zotero-captcha-image').src = this._io.dataIn.imgUrl; document.getElementById('zotero-captcha-input').focus(); } diff --git a/chrome/content/zotero/captcha.xul b/chrome/content/zotero/captcha.xul @@ -1,16 +1,22 @@ -<?xml version="1.0" ?> +<?xml version="1.0"?> + <?xml-stylesheet href="chrome://global/skin/" type="text/css"?> +<?xml-stylesheet href="chrome://zotero/skin/zotero.css" type="text/css"?> + <!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd"> <window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" - title="&zotero.captcha.title;" onload="Zotero_Captcha.onLoad();" - id="zotero-captcha"> + id="zotero-captcha" + onkeypress="if(event.keyCode === KeyEvent.DOM_VK_ESCAPE) Zotero_Captcha.cancel();"> + <script src="include.js"/> <script src="captcha.js"/> <vbox style="padding:10px" align="center" flex="1"> + <description id="zotero-captcha-description"></description> <image id="zotero-captcha-image" onload="Zotero_Captcha.imageOnLoad();" /> + <description id="zotero-captcha-error"></description> <textbox id="zotero-captcha-input" onkeypress="if(event.keyCode === KeyEvent.DOM_VK_RETURN) Zotero_Captcha.resolve();" /> <hbox> diff --git a/chrome/content/zotero/pdfProgress.xul 
b/chrome/content/zotero/pdfProgress.xul @@ -14,7 +14,9 @@ <tree flex="1" id="tree" hidecolumnpicker="true"> <treecols> <treecol id="success-col" style="width:20px;"/> + <splitter class="tree-splitter" hidden="true"/> <treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/> + <splitter class="tree-splitter"/> <treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/> </treecols> <treechildren id="treechildren"/> diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js @@ -33,7 +33,7 @@ */ var Zotero_RecognizePDF = new function() { Components.utils.import("resource://zotero/q.js"); - var _progressWindow, _progressIndicator, itemRecognizer; + var _progressWindow, _progressIndicator; /** * Checks whether a given PDF could theoretically be recognized @@ -56,7 +56,7 @@ var Zotero_RecognizePDF = new function() { var items = ZoteroPane_Local.getSelectedItems(); if (!items) return; - itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer(); + var itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer(); itemRecognizer.recognizeItems(items); } @@ -67,9 +67,8 @@ var Zotero_RecognizePDF = new function() { * @param {Integer|null} libraryID The library in which to save the PDF * @return {Promise} A promise resolved when PDF metadata has been retrieved */ - this.recognize = function(file, libraryID) { + this.recognize = function(file, libraryID, stopCheckCallback) { const MAX_PAGES = 7; - const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms var me = this; return _extractText(file, MAX_PAGES).then(function(lines) { @@ -116,129 +115,7 @@ var Zotero_RecognizePDF = new function() { // If no DOI or ISBN, query Google Scholar return promise.fail(function(error) { Zotero.debug("RecognizePDF: "+error); - - // Don't try Google Scholar if we already reached query limit - if(itemRecognizer._gsQueryLimitReached) throw new Zotero.Exception.Alert("recognizePDF.limit"); - - // Use only first column from multi-column lines 
- const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/; - var cleanedLines = [], cleanedLineLengths = []; - for(var i=0; i<lines.length && cleanedLines.length<100; i++) { - var m = lineRe.exec(lines[i]); - if(m && m[1].split(' ').length > 3) { - cleanedLines.push(m[1]); - cleanedLineLengths.push(m[1].length); - } - } - - // get (not quite) median length - var lineLengthsLength = cleanedLineLengths.length; - if(lineLengthsLength < 20 - || cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") { - throw new Zotero.Exception.Alert("recognizePDF.noOCR"); - } - - var sortedLengths = cleanedLineLengths.sort(), - medianLength = sortedLengths[Math.floor(lineLengthsLength/2)]; - - // pick lines within 6 chars of the median (this is completely arbitrary) - var goodLines = [], - uBound = medianLength + 6, - lBound = medianLength - 6; - for (var i=0; i<lineLengthsLength; i++) { - if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) { - // Strip quotation marks so they don't mess up search query quoting - var line = cleanedLines[i].replace('"', ''); - goodLines.push(line); - } - } - - var nextLine = 0, - limited = false, - queryGoogle = function() { - // If the users fails (or chooses not) to solve the CAPTCHA, don't keep trying - if(limited) throw new Zotero.Exception.Alert("recognizePDF.limit"); - - // Take the relevant parts of some lines (exclude hyphenated word) - var queryString = "", queryStringWords = 0; - while(queryStringWords < 25) { - if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches"); - - var words = goodLines.splice(nextLine, 1)[0].split(/\s+/); - // Try to avoid picking adjacent strings so the odds of them appearing in another - // document quoting our document is low. 
Every 7th line is a magic value - nextLine = (nextLine + 7) % goodLines.length; - - // get rid of first and last words - words.shift(); - words.pop(); - // make sure there are no long words (probably OCR mistakes) - var skipLine = false; - for(var i=0; i<words.length; i++) { - if(words[i].length > 20) { - skipLine = true; - break; - } - } - // add words to query - if(!skipLine && words.length) { - queryStringWords += words.length; - queryString += '"'+words.join(" ")+'" '; - } - } - - Zotero.debug("RecognizePDF: Query string "+queryString); - - var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search", - delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime); - - // Delay - return (delay > 0 ? Q.delay(delay) : Q.when()) - .then(function() { - Zotero.HTTP.lastGoogleScholarQueryTime = Date.now(); - return Zotero.HTTP.promise("GET", url, {"responseType":"document"}) - }) - .then(function(xmlhttp) { - var doc = xmlhttp.response, - deferred = Q.defer(), - translate = new Zotero.Translate.Web(); - - if(Zotero.Utilities.xpath(doc, "//form[@action='Captcha']").length) { - return _solveCaptcha(xmlhttp, 3); - } - - translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); - translate.setDocument(Zotero.HTTP.wrapDocument(doc, url)); - translate.setHandler("translators", function(translate, detected) { - if(detected.length) { - deferred.resolve(_promiseTranslate(translate, libraryID)); - } else { - deferred.reject(new Zotero.Exception.Alert("recognizePDF.noMatches")); - } - }); - translate.getTranslators(); - - return deferred.promise; - }, function(e) { - if(e instanceof Zotero.HTTP.UnexpectedStatusException - && (e.status == 403 || e.status == 503)) { - return _solveCaptcha(e.xmlhttp, 3); // Give the user 3 chances to get it right - } - throw e; - }); - }; - - var retryCount = 2; - var retryGS = function(e) { - if(!retryCount--) throw e; - // Only retry if we can't find matches - if(e 
instanceof Zotero.Exception.Alert && e.name == "recognizePDF.noMatches") { - return queryGoogle().catch(retryGS); - } - throw e; - } - - return queryGoogle().catch(retryGS); + return me.GSFullTextSearch.findItem(lines, libraryID, stopCheckCallback); }); }); } @@ -362,80 +239,6 @@ var Zotero_RecognizePDF = new function() { return validIsbns; } - function _extractCaptchaFormData(doc) { - var formData = {}; - - var img = doc.getElementsByTagName('img')[0]; - if(!img) return; - formData.img = img.src; - - var form = doc.forms[0]; - if(!form) return; - - formData.action = form.action; - formData.input = {}; - var inputs = form.getElementsByTagName('input'); - for(var i=0, n=inputs.length; i<n; i++) { - if(!inputs[i].name) continue; - formData.input[inputs[i].name] = inputs[i].value; - } - - formData.continue = "http://scholar.google.com"; - - return formData; - } - - function _solveCaptcha(xmlhttp, tries) { - var doc = xmlhttp.response; - - if(tries === undefined) tries = 3; - if(!tries) throw new Zotero_RecognizePDF.CaptchaResult(false); - tries--; - - var formData = doc && _extractCaptchaFormData(doc); - if(!formData) throw new Zotero.Exception.Alert('recognizePDF.limit'); - - var io = { dataIn: { - imgUrl: formData.img - }}; - - _progressWindow.openDialog("chrome://zotero/content/captcha.xul", "", - "chrome,modal,resizable=no,centerscreen", io); - - if(!io.dataOut) { - return Q.reject(new Zotero_RecognizePDF.CaptchaResult(false)); - } - - formData.input.captcha = io.dataOut.captcha; - var url = '', prop; - for(prop in formData.input) { - url += '&' + encodeURIComponent(prop) + '=' - + encodeURIComponent(formData.input[prop]); - } - - url = formData.action + '?' 
+ url.substr(1); - - return Zotero.HTTP.promise("GET", url, {"responseType":"document"}) - .then(function() { - throw new Zotero_RecognizePDF.CaptchaResult(true); - }) - .catch(function(e) { - if(e instanceof Zotero.HTTP.UnexpectedStatusException - && (e.status == 403 || e.status == 503)) { - return _solveCaptcha(e.xmlhttp, tries); - } - throw e; - }); - } - - this.CaptchaResult = function(success) { - this.success = success; - }; - - this.CaptchaResult.prototype.toString = function() { - return this.success ? "CAPTCHA successful" : "CAPTCHA failed"; - }; - /** * @class Handles UI, etc. for recognizing multiple items */ @@ -448,7 +251,6 @@ var Zotero_RecognizePDF = new function() { "_itemsTotal": 0, "_progressWindow": null, "_progressIndicator": null, - "_gsQueryLimitReached": false, /** * Retreives metadata for the PDF items passed, displaying a progress dialog during conversion @@ -470,7 +272,12 @@ var Zotero_RecognizePDF = new function() { "stop": function() { this._stopped = true; }, - + + "close": function() { + this.stop(); + this._progressWindow.close(); + }, + /** * Called when the progress window has been opened; adds items to the tree and begins recognizing * @param @@ -499,13 +306,18 @@ var Zotero_RecognizePDF = new function() { } var me = this; + this._cancelHandler = function() { me.stop() }; + this._keypressCancelHandler = function(e) { + if(e.keyCode === KeyEvent.DOM_VK_ESCAPE) me.stop(); + }; + _progressIndicator = this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator"); - this._progressWindow.document.getElementById("cancel-button").addEventListener("command", function() { - me.stop(); - me._progressWindow.close(); - }, false); - this._progressWindow.addEventListener("close", function() { me.stop() }, false); - this._gsQueryLimitReached = false; // Clear query limit flag + this._progressWindow.document.getElementById("cancel-button") + .addEventListener("command", this._cancelHandler, false); + // Also cancel if 
the user presses Esc + this._progressWindow.addEventListener("keypress", this._keypressCancelHandler); + this._progressWindow.addEventListener("close", this._cancelHandler, false); + Zotero_RecognizePDF.GSFullTextSearch.resetQueryLimit(); this._recognizeItem(); }, @@ -514,8 +326,6 @@ var Zotero_RecognizePDF = new function() { * @private */ "_recognizeItem": function() { - if(this._stopped) return; - Components.utils.import("resource://zotero/q.js"); const SUCCESS_IMAGE = "chrome://zotero/skin/tick.png"; @@ -527,6 +337,12 @@ var Zotero_RecognizePDF = new function() { return; } + // Order here matters. Otherwise we may show an incorrect label + if(this._stopped) { + this._done(true); + return; + } + this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100; var item = this._items.shift(), @@ -538,13 +354,13 @@ var Zotero_RecognizePDF = new function() { var file = item.getFile(), me = this; (file - ? Zotero_RecognizePDF.recognize(file, item.libraryID) + ? Zotero_RecognizePDF.recognize(file, item.libraryID, function() { return me._stopped; }) : Q.reject(new Zotero.Exception.Alert("recognizePDF.fileNotFound"))) .then(function(newItem) { // If already stopped, delete if(me._stopped) { - Zotero.Items.erase(item.id); - return; + Zotero.Items.erase(newItem.id); + throw new Zotero.Exception.Alert('recognizePDF.stopped'); } // put new item in same collections as the old one @@ -562,32 +378,23 @@ var Zotero_RecognizePDF = new function() { itemIcon.setAttribute("src", SUCCESS_IMAGE); me._recognizeItem(); - }, function(error) { - if(error instanceof Zotero_RecognizePDF.CaptchaResult && error.success) { - // Redo last item - me._items.unshift(item); - me._recognizeItem(); - return; - } - + }) + .catch(function(error) { Zotero.debug(error); Zotero.logError(error); - if(error instanceof Zotero_RecognizePDF.CaptchaResult && !error.success) { - error = new Zotero.Exception.Alert("recognizePDF.limit"); - } - - if(error instanceof Zotero.Exception.Alert 
&& error.name === "recognizePDF.limit") { - this._gsQueryLimitReached = true;; - } - itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error")); itemIcon.setAttribute("src", FAILURE_IMAGE); - me._recognizeItem(); - }).fin(function() { + // Don't show "completed" label if stopped on last item + if(me._stopped && !me._items.length) { + me._done(true); + } else { + me._recognizeItem(); + } + }).finally(function() { // scroll to this item - me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-5)); + me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-4)); }).done(); }, @@ -595,11 +402,376 @@ var Zotero_RecognizePDF = new function() { * Cleans up after items are recognized, disabling the cancel button and making the progress window * close on blur */ - "_done": function() { + "_done": function(cancelled) { this._progressIndicator.value = 100; - this._progressWindow.document.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label"); - var me = this; - this._progressWindow.document.getElementById("label").value = Zotero.getString("recognizePDF.complete.label"); + // Switch out cancel for close + var cancelButton = this._progressWindow.document.getElementById("cancel-button"), + me = this; + cancelButton.label = Zotero.getString("recognizePDF.close.label"); + cancelButton.removeEventListener("command", this._cancelHandler, false); + cancelButton.addEventListener("command", function() { me.close() }, false); + this._progressWindow.removeEventListener("keypress", this._keypressCancelHandler); + this._progressWindow.addEventListener("keypress", function() { me.close() }); + + if(Zotero.isMac) { + //on MacOS X, the windows are not always on top, so we hide them on blur + // to avoid clutter + this._setCloseTimer(); + } + 
this._progressWindow.document.getElementById("label").value = + cancelled ? Zotero.getString("recognizePDF.cancelled.label") + : Zotero.getString("recognizePDF.complete.label"); + }, + + "_setCloseTimer": function() { + var me = this, win = this._progressWindow; + var focusListener = function() { + if(!win.zoteroCloseTimeoutID) return; + + win.clearTimeout(win.zoteroCloseTimeoutID); + delete win.zoteroCloseTimeoutID; + + win.removeEventListener('blur', blurListener, false); + win.removeEventListener('focus', focusListener, false); + }; + var blurListener = function() { + //close window after losing focus for 5 seconds + win.zoteroCloseTimeoutID = win.setTimeout(function() { win.close() }, 5000); + //re-set timer if we gain focus again + win.addEventListener("focus", focusListener, false); + }; + win.addEventListener("blur", blurListener, false); } - } + }; + + this.GSFullTextSearch = new function() { + const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms + var queryLimitReached = false, + inProgress = false, + queue = [], + stopCheckCallback; // As long as we process one query at a time, this is ok + //load nsICookieManager2 + Components.utils.import("resource://gre/modules/Services.jsm"); + var cookieService = Services.cookies; + + this.resetQueryLimit = function() { + queryLimitReached = false; + }; + + this.findItem = function(lines, libraryID, stopCheckCallback) { + if(!inProgress && queryLimitReached) { + //there's no queue, so we can reject immediately + return Q.reject(new Zotero.Exception.Alert("recognizePDF.limit")); + } + + var deferred = Q.defer(); + queue.push({ + deferred: deferred, + lines: lines, + libraryID: libraryID, + stopCheckCallback: stopCheckCallback + }); + _processQueue(); + return deferred.promise; + }; + + function _processQueue(proceed) { + if(inProgress && !proceed) return; //only one at a time + + if(!queue.length) { + inProgress = false; + return; + } + + inProgress = true; + if(queryLimitReached) { + //irreversibly blocked. 
Reject remaining items in queue + var item; + while(item = queue.shift()) { + item.deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit")); + } + _processQueue(true); //wrap it up + } else { + var item = queue.shift(); + + stopCheckCallback = item.stopCheckCallback; + if(stopCheckCallback && stopCheckCallback()) { + item.deferred.reject(new Zotero.Exception.Alert('recognizePDF.stopped')); + _processQueue(true); + return; + } + + item.deferred.resolve( + Q.try(getGoodLines, item.lines) + .then(function(lines) { + return queryGoogle(lines, item.libraryID, 3); //try querying 3 times + }) + .finally(function() { _processQueue(true); }) + ); + } + } + + function getGoodLines(lines) { + // Use only first column from multi-column lines + const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/; + var cleanedLines = [], cleanedLineLengths = []; + for(var i=0; i<lines.length && cleanedLines.length<100; i++) { + var m = lineRe.exec(lines[i]); + if(m && m[1].split(' ').length > 3) { + cleanedLines.push(m[1]); + cleanedLineLengths.push(m[1].length); + } + } + + // get (not quite) median length + var lineLengthsLength = cleanedLineLengths.length; + if(lineLengthsLength < 20 + || cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") { + throw new Zotero.Exception.Alert("recognizePDF.noOCR"); + } + + var sortedLengths = cleanedLineLengths.sort(), + medianLength = sortedLengths[Math.floor(lineLengthsLength/2)]; + + // pick lines within 6 chars of the median (this is completely arbitrary) + var goodLines = [], + uBound = medianLength + 6, + lBound = medianLength - 6; + for (var i=0; i<lineLengthsLength; i++) { + if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) { + // Strip quotation marks so they don't mess up search query quoting + var line = cleanedLines[i].replace('"', ''); + goodLines.push(line); + } + } + return goodLines; + } + + function 
queryGoogle(goodLines, libraryID, tries) { + if(tries <= 0) throw new Zotero.Exception.Alert("recognizePDF.noMatches"); + + // Take the relevant parts of some lines (exclude hyphenated word) + var queryString = "", queryStringWords = 0, nextLine = 0; + while(queryStringWords < 25) { + if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches"); + + var words = goodLines.splice(nextLine, 1)[0].split(/\s+/); + // Try to avoid picking adjacent strings so the odds of them appearing in another + // document quoting our document is low. Every 7th line is a magic value + nextLine = (nextLine + 7) % goodLines.length; + + // get rid of first and last words + words.shift(); + words.pop(); + // make sure there are no long words (probably OCR mistakes) + var skipLine = false; + for(var i=0; i<words.length; i++) { + if(words[i].length > 20) { + skipLine = true; + break; + } + } + // add words to query + if(!skipLine && words.length) { + queryStringWords += words.length; + queryString += '"'+words.join(" ")+'" '; + } + } + + Zotero.debug("RecognizePDF: Query string " + queryString); + + var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search", + delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime); + + // Delay + return (delay > 0 ? 
Q.delay(delay) : Q()) + .then(function() { + Zotero.HTTP.lastGoogleScholarQueryTime = Date.now(); + return Zotero.HTTP.promise("GET", url, {"responseType":"document"}) + }) + .then(function(xmlhttp) { + return _checkCaptchaOK(xmlhttp, 3); + }, + function(e) { + return _checkCaptchaError(e, 3); + }) + .then(function(xmlhttp) { + var doc = xmlhttp.response, + deferred = Q.defer(), + translate = new Zotero.Translate.Web(); + + translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); + translate.setDocument(Zotero.HTTP.wrapDocument(doc, url)); + translate.setHandler("translators", function(translate, detected) { + if(detected.length) { + deferred.resolve(_promiseTranslate(translate, libraryID)); + } else { + deferred.resolve(Q.try(function() { + return queryGoogle(goodLines, libraryID, tries-1); + })); + } + }); + translate.getTranslators(); + + return deferred.promise; + }) + .catch(function(e) { + if(e.name == "recognizePDF.limit") { + queryLimitReached = true; + } + throw e; + }); + } + + function _checkCaptchaOK(xmlhttp, tries) { + if(stopCheckCallback && stopCheckCallback()) { + throw new Zotero.Exception.Alert('recognizePDF.stopped'); + } + + //check for captcha on page with HTTP 200 status + if(Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']").length) { + return _solveCaptcha(xmlhttp, tries); + } + return xmlhttp; + } + + function _checkCaptchaError(e, tries, dontClearCookies) { + if(stopCheckCallback && stopCheckCallback()) { + throw new Zotero.Exception.Alert('recognizePDF.stopped'); + } + + //check for captcha on error page + if(e instanceof Zotero.HTTP.UnexpectedStatusException + && (e.status == 403 || e.status == 503) && e.xmlhttp.response) { + if(_extractCaptchaFormData(e.xmlhttp.response)) { + return _solveCaptcha(e.xmlhttp, tries); + } else if(!dontClearCookies && e.xmlhttp.channel) { //make sure we can obtain original URL + //AFAICT, for 403 errors, GS just says "sorry, try later", + // but if you clear cookies, you get a 
captcha + if(!_clearGSCookies(e.xmlhttp.channel.originalURI.host)) { + //user said no or no cookies removed + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + //redo GET request + return Zotero.HTTP.promise("GET", e.xmlhttp.channel.originalURI.spec, {"responseType":"document"}) + .then(function(xmlhttp) { + return _checkCaptchaOK(xmlhttp, tries, true); //don't try this again + }, + function(e) { + return _checkCaptchaError(e, tries, true); //don't try this again + }); + } + + Zotero.debug("RecognizePDF: Google Scholar returned an unexpected page" + + " with status " + e.status); + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + throw e; + } + + function _solveCaptcha(xmlhttp, tries) { + var doc = xmlhttp.response; + + if(tries === undefined) tries = 3; + + if(!tries) { + Zotero.debug("RecognizePDF: Failed to solve CAPTCHA after multiple attempts."); + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + + tries--; + var formData = doc && _extractCaptchaFormData(doc); + if(!formData) { + Zotero.debug("RecognizePDF: Could not find CAPTCHA on page."); + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + + var io = { dataIn: { + title: Zotero.getString("recognizePDF.captcha.title"), + description: Zotero.getString("recognizePDF.captcha.description"), + imgUrl: formData.img + }}; + + _progressWindow.openDialog("chrome://zotero/content/captcha.xul", "", + "chrome,modal,resizable=no,centerscreen", io); + + if(!io.dataOut) { + Zotero.debug("RecognizePDF: No CAPTCHA entered"); + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + + formData.input.captcha = io.dataOut.captcha; + var url = '', prop; + for(prop in formData.input) { + url += '&' + encodeURIComponent(prop) + '=' + + encodeURIComponent(formData.input[prop]); + } + + url = formData.action + '?' 
+ url.substr(1); + + return Zotero.HTTP.promise("GET", url, {"responseType":"document"}) + .then(function(xmlhttp) { + return _checkCaptchaOK(xmlhttp, tries); + }, + function(e) { + return _checkCaptchaError(e, tries); + }); + } + + function _extractCaptchaFormData(doc) { + var formData = {}; + + var img = doc.getElementsByTagName('img')[0]; + if(!img) return; + formData.img = img.src; + + var form = doc.forms[0]; + if(!form) return; + + formData.action = form.action; + formData.input = {}; + var inputs = form.getElementsByTagName('input'); + for(var i=0, n=inputs.length; i<n; i++) { + if(!inputs[i].name) continue; + formData.input[inputs[i].name] = inputs[i].value; + } + + formData.continue = "http://scholar.google.com"; + + return formData; + } + + function _clearGSCookies(host) { + /* There don't seem to be any negative effects of deleting GDSESS + if(!Zotero.isStandalone) { + //ask user first + var response = Components.classes["@mozilla.org/embedcomp/prompt-service;1"] + .getService(Components.interfaces.nsIPromptService) + .confirm(null, "Clear Google Scholar cookies?", + "Google Scholar is attempting to block further queries. We can " + + "clear certain cookies and try again. This may affect some " + + "temporary Google preferences or it may log you out. 
May we clear" + + " your Google Scholar cookies?"); + if(!response) return; + }*/ + + //find GDSESS cookie + var removed = false, cookies = cookieService.getCookiesFromHost(host); + while(cookies.hasMoreElements()) { + var cookie = cookies.getNext().QueryInterface(Components.interfaces.nsICookie2); + if(["GDSESS", "PREF"].indexOf(cookie.name) !== -1) { + Zotero.debug("RecognizePDF: Removing cookie " + cookie.name + " for host " + + cookie.host + " and path " + cookie.path); + cookieService.remove(cookie.host, cookie.name, cookie.path, false); + removed = true; + } + } + + if(!removed) { + Zotero.debug("RecognizePDF: No cookies removed"); + } + + return removed; + } + }; } \ No newline at end of file diff --git a/chrome/locale/en-US/zotero/zotero.dtd b/chrome/locale/en-US/zotero/zotero.dtd @@ -255,7 +255,6 @@ <!ENTITY zotero.recognizePDF.cancel.label "Cancel"> <!ENTITY zotero.recognizePDF.pdfName.label "PDF Name"> <!ENTITY zotero.recognizePDF.itemName.label "Item Name"> -<!ENTITY zotero.recognizePDF.captcha.label "Type the text below to continue retrieving metadata."> <!ENTITY zotero.rtfScan.title "RTF Scan"> <!ENTITY zotero.rtfScan.cancel.label "Cancel"> @@ -284,6 +283,4 @@ <!ENTITY zotero.downloadManager.label "Save to Zotero"> <!ENTITY zotero.downloadManager.saveToLibrary.description "Attachments cannot be saved to the currently selected library. 
This item will be saved to your library instead."> -<!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences."> - -<!ENTITY zotero.captcha.title "Please enter CAPTCHA"> -\ No newline at end of file +<!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences."> +\ No newline at end of file diff --git a/chrome/locale/en-US/zotero/zotero.properties b/chrome/locale/en-US/zotero/zotero.properties @@ -895,12 +895,16 @@ proxies.recognized.add = Add Proxy recognizePDF.noOCR = PDF does not contain OCRed text. recognizePDF.couldNotRead = Could not read text from PDF. -recognizePDF.noMatches = No matching references found. -recognizePDF.fileNotFound = File not found. +recognizePDF.noMatches = No matching references found +recognizePDF.fileNotFound = File not found recognizePDF.limit = Google Scholar query limit reached. Try again later. recognizePDF.error = An unexpected error occurred. -recognizePDF.complete.label = Metadata Retrieval Complete. +recognizePDF.stopped = Cancelled +recognizePDF.complete.label = Metadata Retrieval Complete +recognizePDF.cancelled.label = Metadata Retrieval Cancelled recognizePDF.close.label = Close +recognizePDF.captcha.title = Please enter CAPTCHA +recognizePDF.captcha.description = Zotero uses Google Scholar to help identify PDFs. To continue using Google Scholar, please enter the text from the image below. 
rtfScan.openTitle = Select a file to scan rtfScan.scanning.label = Scanning RTF Document… diff --git a/chrome/skin/default/zotero/zotero.css b/chrome/skin/default/zotero/zotero.css @@ -303,7 +303,6 @@ label.zotero-text-link { margin-bottom: 1em; } - .zotero-small-progress-indicator { list-style-image: url(chrome://global/skin/icons/notloading_16.png); margin-left: -2px; @@ -316,4 +315,19 @@ label.zotero-text-link { #zotero-note-window { padding-bottom: 4px; +} + +#zotero-captcha-description { + max-width: 300px; + padding-bottom: 4px; + text-align: justify; +} + +#zotero-captcha-error { + max-width: 300px; + padding-bottom: 4px; + padding-top: 4px; + font-weight: bold; + color: red; + text-align: center; } \ No newline at end of file