www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit b3da19e96df1b0e0f0db49984dc897051db9037c
parent c1c2f6b9decca4c97c5005954f36026b12766b80
Author: aurimasv <aurimas.dev@gmail.com>
Date:   Mon, 30 Dec 2013 23:00:56 -0600

[Retrieve Metadata] Recognize HTTP 503 code as Google Scholar CAPTCHA + other tweaks.
* Stop metadata retrieval when cancelled
* Display CAPTCHA dialog
* Don't close window on blur
* Use Zotero.Utilities.cleanISBN to validate ISBNs

Diffstat:
A chrome/content/zotero/captcha.js | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
A chrome/content/zotero/captcha.xul | 21 +++++++++++++++++++++
M chrome/content/zotero/recognizePDF.js | 168 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----------------------
M chrome/locale/en-US/zotero/zotero.dtd | 5 +++++
M chrome/locale/en-US/zotero/zotero.properties | 2 +-
5 files changed, 201 insertions(+), 48 deletions(-)

diff --git a/chrome/content/zotero/captcha.js b/chrome/content/zotero/captcha.js @@ -0,0 +1,52 @@ +/* + ***** BEGIN LICENSE BLOCK ***** + + Copyright © 2009 Center for History and New Media + George Mason University, Fairfax, Virginia, USA + http://zotero.org + + This file is part of Zotero. + + Zotero is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Zotero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with Zotero. If not, see <http://www.gnu.org/licenses/>. + + ***** END LICENSE BLOCK ***** +*/ + +var Zotero_Captcha = new function() { + this._io; + + this.onLoad = function() { + this._io = window.arguments[0]; + document.getElementById('zotero-captcha-image').src = this._io.dataIn.imgUrl; + document.getElementById('zotero-captcha-input').focus(); + } + + this.imageOnLoad = function() { + window.sizeToContent(); + } + + this.resolve = function() { + var result = document.getElementById('zotero-captcha-input'); + if(!result.value) return; + + this._io.dataOut = { + captcha: result.value + }; + window.close(); + } + + this.cancel = function() { + window.close(); + } +} +\ No newline at end of file diff --git a/chrome/content/zotero/captcha.xul b/chrome/content/zotero/captcha.xul @@ -0,0 +1,21 @@ +<?xml version="1.0" ?> +<?xml-stylesheet href="chrome://global/skin/" type="text/css"?> +<!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd"> + +<window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" + title="&zotero.captcha.title;" + onload="Zotero_Captcha.onLoad();" + 
id="zotero-captcha"> + + <script src="captcha.js"/> + + <vbox style="padding:10px" align="center" flex="1"> + <image id="zotero-captcha-image" onload="Zotero_Captcha.imageOnLoad();" /> + <textbox id="zotero-captcha-input" + onkeypress="if(event.keyCode === KeyEvent.DOM_VK_RETURN) Zotero_Captcha.resolve();" /> + <hbox> + <button label="&zotero.general.ok;" default="true" oncommand="Zotero_Captcha.resolve();" /> + <button label="&zotero.general.cancel;" oncommand="Zotero_Captcha.cancel();" /> + </hbox> + </vbox> +</window> diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js @@ -33,7 +33,7 @@ */ var Zotero_RecognizePDF = new function() { Components.utils.import("resource://zotero/q.js"); - var _progressWindow, _progressIndicator; + var _progressWindow, _progressIndicator, itemRecognizer; /** * Checks whether a given PDF could theoretically be recognized @@ -56,7 +56,7 @@ var Zotero_RecognizePDF = new function() { var items = ZoteroPane_Local.getSelectedItems(); if (!items) return; - var itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer(); + itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer(); itemRecognizer.recognizeItems(items); } @@ -70,6 +70,7 @@ var Zotero_RecognizePDF = new function() { this.recognize = function(file, libraryID) { const MAX_PAGES = 7; const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms + var me = this; return _extractText(file, MAX_PAGES).then(function(lines) { // Look for DOI - Use only first 80 lines to avoid catching article references @@ -105,6 +106,9 @@ var Zotero_RecognizePDF = new function() { return promise.fail(function(error) { Zotero.debug("RecognizePDF: "+error); + // Don't try Google Scholar if we already reached query limit + if(itemRecognizer._gsQueryLimitReached) throw new Zotero.Exception.Alert("recognizePDF.limit"); + // Use only first column from multi-column lines const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/; var cleanedLines = [], cleanedLineLengths = []; @@ -141,7 +145,7 @@ 
var Zotero_RecognizePDF = new function() { var nextLine = 0, limited = false, queryGoogle = function() { - // Once we hit the CAPTCHA once, don't keep trying + // If the users fails (or chooses not) to solve the CAPTCHA, don't keep trying if(limited) throw new Zotero.Exception.Alert("recognizePDF.limit"); // Take the relevant parts of some lines (exclude hyphenated word) @@ -189,9 +193,7 @@ var Zotero_RecognizePDF = new function() { translate = new Zotero.Translate.Web(); if(Zotero.Utilities.xpath(doc, "//form[@action='Captcha']").length) { - // Hit CAPTCHA - limited = true; - throw new Zotero.Exception.Alert("recognizePDF.limit"); + return _solveCaptcha(xmlhttp, 3); } translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); @@ -207,15 +209,25 @@ var Zotero_RecognizePDF = new function() { return deferred.promise; }, function(e) { - if(e instanceof Zotero.HTTP.UnexpectedStatusException && e.status == 403) { - // Hit hard block - throw new Zotero.Exception.Alert("recognizePDF.limit"); + if(e instanceof Zotero.HTTP.UnexpectedStatusException + && (e.status == 403 || e.status == 503)) { + return _solveCaptcha(e.xmlhttp, 3); // Give the user 3 chances to get it right } throw e; }); }; - - return queryGoogle().fail(queryGoogle).fail(queryGoogle); + + var retryCount = 2; + var retryGS = function(e) { + if(!retryCount--) throw e; + // Only retry if we can't find matches + if(e instanceof Zotero.Exception.Alert && e.name == "recognizePDF.noMatches") { + return queryGoogle().catch(retryGS); + } + throw e; + } + + return queryGoogle().catch(retryGS); }); }); } @@ -331,39 +343,87 @@ var Zotero_RecognizePDF = new function() { } // Validate ISBNs - var validIsbns = []; + var validIsbns = [], cleanISBN; for (var i =0; i < isbns.length; i++) { - if(_isValidISBN(isbns[i])) validIsbns.push(isbns[i]); + cleanISBN = Zotero.Utilities.cleanISBN(isbns[i]); + if(cleanISBN) validIsbns.push(cleanISBN); } return validIsbns; } - /** - * Check whether an ISBNs is valid - * @private - * 
@return {Boolean} - */ - function _isValidISBN(isbn) { - if(isbn.length == 13) { - // ISBN-13 should start with 978 or 979 i.e. GS1 for book publishing industry - var prefix = isbn.slice(0,3); - if (prefix != "978" && prefix != "979") return false; - // Verify check digit - var check = 0; - for (var i = 0; i < 13; i+=2) check += isbn[i]*1; - for (i = 1; i < 12; i+=2) check += 3 * isbn[i]*1; - return (check % 10 == 0); - } else if(isbn.length == 10) { - // Verify ISBN-10 check digit - var check = 0; - for (var i = 0; i < 9; i++) check += isbn[i]*1 * (10-i); - // last number might be 'X' - if (isbn[9] == 'X' || isbn[9] == 'x') check += 10; - else check += isbn[i]*1; - return (check % 11 == 0); + function _extractCaptchaFormData(doc) { + var formData = {}; + + var img = doc.getElementsByTagName('img')[0]; + if(!img) return; + formData.img = img.src; + + var form = doc.forms[0]; + if(!form) return; + + formData.action = form.action; + formData.input = {}; + var inputs = form.getElementsByTagName('input'); + for(var i=0, n=inputs.length; i<n; i++) { + if(!inputs[i].name) continue; + formData.input[inputs[i].name] = inputs[i].value; } - return false; + + formData.continue = "http://scholar.google.com"; + + return formData; } + + function _solveCaptcha(xmlhttp, tries) { + var doc = xmlhttp.response; + + if(tries === undefined) tries = 3; + if(!tries) throw new Zotero_RecognizePDF.CaptchaResult(false); + tries--; + + var formData = doc && _extractCaptchaFormData(doc); + if(!formData) throw new Zotero.Exception.Alert('recognizePDF.limit'); + + var io = { dataIn: { + imgUrl: formData.img + }}; + + _progressWindow.openDialog("chrome://zotero/content/captcha.xul", "", + "chrome,modal,resizable=no,centerscreen", io); + + if(!io.dataOut) { + return Q.reject(new Zotero_RecognizePDF.CaptchaResult(false)); + } + + formData.input.captcha = io.dataOut.captcha; + var url = '', prop; + for(prop in formData.input) { + url += '&' + encodeURIComponent(prop) + '=' + + 
encodeURIComponent(formData.input[prop]); + } + + url = formData.action + '?' + url.substr(1); + + return Zotero.HTTP.promise("GET", url, {"responseType":"document"}) + .then(function() { + throw new Zotero_RecognizePDF.CaptchaResult(true); + }) + .catch(function(e) { + if(e instanceof Zotero.HTTP.UnexpectedStatusException + && (e.status == 403 || e.status == 503)) { + return _solveCaptcha(e.xmlhttp, tries); + } + throw e; + }); + } + + this.CaptchaResult = function(success) { + this.success = success; + }; + + this.CaptchaResult.prototype.toString = function() { + return this.success ? "CAPTCHA successful" : "CAPTCHA failed"; + }; /** * @class Handles UI, etc. for recognizing multiple items @@ -377,6 +437,7 @@ var Zotero_RecognizePDF = new function() { "_itemsTotal": 0, "_progressWindow": null, "_progressIndicator": null, + "_gsQueryLimitReached": false, /** * Retreives metadata for the PDF items passed, displaying a progress dialog during conversion @@ -388,7 +449,7 @@ var Zotero_RecognizePDF = new function() { this._items = items.slice(); this._itemTotal = items.length; - this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen"); + _progressWindow = this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen"); this._progressWindow.addEventListener("pageshow", function() { me._onWindowLoaded() }, false); }, @@ -427,12 +488,13 @@ var Zotero_RecognizePDF = new function() { } var me = this; - this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator"); + _progressIndicator = this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator"); this._progressWindow.document.getElementById("cancel-button").addEventListener("command", function() { me.stop(); me._progressWindow.close(); }, false); 
this._progressWindow.addEventListener("close", function() { me.stop() }, false); + this._gsQueryLimitReached = false; // Clear query limit flag this._recognizeItem(); }, @@ -441,6 +503,8 @@ var Zotero_RecognizePDF = new function() { * @private */ "_recognizeItem": function() { + if(this._stopped) return; + Components.utils.import("resource://zotero/q.js"); const SUCCESS_IMAGE = "chrome://zotero/skin/tick.png"; @@ -458,6 +522,7 @@ var Zotero_RecognizePDF = new function() { itemIcon = this._progressWindow.document.getElementById("item-"+item.id+"-icon"), itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title"); itemIcon.setAttribute("src", LOADING_IMAGE); + itemTitle.setAttribute("label", ""); var file = item.getFile(), me = this; @@ -487,17 +552,28 @@ var Zotero_RecognizePDF = new function() { me._recognizeItem(); }, function(error) { + if(error instanceof Zotero_RecognizePDF.CaptchaResult && error.success) { + // Redo last item + me._items.unshift(item); + me._recognizeItem(); + return; + } + Zotero.debug(error); Zotero.logError(error); - - itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error")); - itemIcon.setAttribute("src", FAILURE_IMAGE); + + if(error instanceof Zotero_RecognizePDF.CaptchaResult && !error.success) { + error = new Zotero.Exception.Alert("recognizePDF.limit"); + } if(error instanceof Zotero.Exception.Alert && error.name === "recognizePDF.limit") { - me._done(); - } else { - me._recognizeItem(); + this._gsQueryLimitReached = true;; } + + itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? 
error.message : Zotero.getString("recognizePDF.error")); + itemIcon.setAttribute("src", FAILURE_IMAGE); + + me._recognizeItem(); }).fin(function() { // scroll to this item me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-5)); @@ -512,8 +588,6 @@ var Zotero_RecognizePDF = new function() { this._progressIndicator.value = 100; this._progressWindow.document.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label"); var me = this; - this._progressWindow.addEventListener("blur", - function() { me._progressWindow.setTimeout(function() { me._progressWindow.close() }, 2000) }, false); this._progressWindow.document.getElementById("label").value = Zotero.getString("recognizePDF.complete.label"); } } diff --git a/chrome/locale/en-US/zotero/zotero.dtd b/chrome/locale/en-US/zotero/zotero.dtd @@ -4,6 +4,8 @@ <!ENTITY zotero.general.deselectAll "Deselect All"> <!ENTITY zotero.general.edit "Edit"> <!ENTITY zotero.general.delete "Delete"> +<!ENTITY zotero.general.ok "OK"> +<!ENTITY zotero.general.cancel "Cancel"> <!ENTITY zotero.errorReport.title "Zotero Error Report"> <!ENTITY zotero.errorReport.unrelatedMessages "The error log may include messages unrelated to Zotero."> @@ -283,3 +285,5 @@ <!ENTITY zotero.downloadManager.label "Save to Zotero"> <!ENTITY zotero.downloadManager.saveToLibrary.description "Attachments cannot be saved to the currently selected library. This item will be saved to your library instead."> <!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences."> + +<!ENTITY zotero.captcha.title "Please enter CAPTCHA"> +\ No newline at end of file diff --git a/chrome/locale/en-US/zotero/zotero.properties b/chrome/locale/en-US/zotero/zotero.properties @@ -897,7 +897,7 @@ recognizePDF.noOCR = PDF does not contain OCRed text. 
recognizePDF.couldNotRead = Could not read text from PDF. recognizePDF.noMatches = No matching references found. recognizePDF.fileNotFound = File not found. -recognizePDF.limit = Query limit reached. Try again later. +recognizePDF.limit = Google Scholar query limit reached. Try again later. recognizePDF.error = An unexpected error occurred. recognizePDF.complete.label = Metadata Retrieval Complete. recognizePDF.close.label = Close