www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 71a37511797b232a58de4a699a9d7df8a8286cbd
parent 121b75ef6c3fbddf72bc20cdd88a20fbea97ecc5
Author: Simon Kornblith <simon@simonster.com>
Date:   Fri, 31 Jan 2014 00:16:09 -0800

Merge pull request #433 from aurimasv/retrieve-meta

Retrieve Metadata query limit fixes
Diffstat:
Achrome/content/zotero/captcha.js | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Achrome/content/zotero/captcha.xul | 27+++++++++++++++++++++++++++
Mchrome/content/zotero/pdfProgress.xul | 2++
Mchrome/content/zotero/recognizePDF.js | 711++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------
Mchrome/locale/en-US/zotero/zotero.dtd | 6++++--
Mchrome/locale/en-US/zotero/zotero.properties | 12++++++++----
Mchrome/skin/default/zotero/zotero.css | 16+++++++++++++++-
7 files changed, 671 insertions(+), 177 deletions(-)

diff --git a/chrome/content/zotero/captcha.js b/chrome/content/zotero/captcha.js @@ -0,0 +1,73 @@ +/* + ***** BEGIN LICENSE BLOCK ***** + + Copyright © 2009 Center for History and New Media + George Mason University, Fairfax, Virginia, USA + http://zotero.org + + This file is part of Zotero. + + Zotero is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + Zotero is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with Zotero. If not, see <http://www.gnu.org/licenses/>. + + ***** END LICENSE BLOCK ***** +*/ + +var Zotero_Captcha = new function() { + this._io; + + this.onLoad = function() { + this._io = window.arguments[0]; + var description = document.getElementById('zotero-captcha-description'), + errorMsg = document.getElementById('zotero-captcha-error'); + + if(this._io.dataIn.title) { + document.title = this._io.dataIn.title; + } + + if(this._io.dataIn.description) { + description.textContent = this._io.dataIn.description; + description.hidden = false; + } else { + description.hidden = true; + } + + if(this._io.dataIn.error) { + errorMsg.textContent = this._io.dataIn.error; + errorMsg.hidden = false; + } else { + errorMsg.hidden = true; + } + + document.getElementById('zotero-captcha-image').src = this._io.dataIn.imgUrl; + document.getElementById('zotero-captcha-input').focus(); + } + + this.imageOnLoad = function() { + window.sizeToContent(); + } + + this.resolve = function() { + var result = document.getElementById('zotero-captcha-input'); + if(!result.value) return; + + this._io.dataOut = { + captcha: result.value + }; + window.close(); + } + + this.cancel = function() { + window.close(); + } +} +\ No newline at end of file diff --git a/chrome/content/zotero/captcha.xul b/chrome/content/zotero/captcha.xul @@ -0,0 +1,27 @@ +<?xml version="1.0"?> + +<?xml-stylesheet href="chrome://global/skin/" type="text/css"?> +<?xml-stylesheet href="chrome://zotero/skin/zotero.css" type="text/css"?> + +<!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd"> + +<window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" + onload="Zotero_Captcha.onLoad();" + id="zotero-captcha" + onkeypress="if(event.keyCode === KeyEvent.DOM_VK_ESCAPE) Zotero_Captcha.cancel();"> + + <script src="include.js"/> + <script src="captcha.js"/> + + <vbox style="padding:10px" align="center" flex="1"> + <description id="zotero-captcha-description"></description> + <image id="zotero-captcha-image" onload="Zotero_Captcha.imageOnLoad();" /> + <description id="zotero-captcha-error"></description> + <textbox id="zotero-captcha-input" + onkeypress="if(event.keyCode === KeyEvent.DOM_VK_RETURN) Zotero_Captcha.resolve();" /> + <hbox> + <button label="&zotero.general.ok;" default="true" oncommand="Zotero_Captcha.resolve();" /> + <button label="&zotero.general.cancel;" oncommand="Zotero_Captcha.cancel();" /> + </hbox> + </vbox> +</window> diff --git a/chrome/content/zotero/pdfProgress.xul b/chrome/content/zotero/pdfProgress.xul @@ -14,7 +14,9 @@ <tree flex="1" id="tree" hidecolumnpicker="true"> <treecols> <treecol id="success-col" style="width:20px;"/> + <splitter class="tree-splitter" hidden="true"/> <treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/> + <splitter class="tree-splitter"/> <treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/> </treecols> <treechildren id="treechildren"/> diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js @@ -65,19 +65,32 @@ var Zotero_RecognizePDF = new function() { * * @param {nsIFile} file The PDF file to retrieve metadata for * @param {Integer|null} libraryID The library in which to save the PDF + * @param {Function} stopCheckCallback Function that returns true if the + * process is to be interrupted * @return {Promise} A promise resolved when PDF metadata has been retrieved */ - this.recognize = function(file, libraryID) { + this.recognize = function(file, libraryID, stopCheckCallback) { const MAX_PAGES = 7; - const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // in ms + var me = this; return _extractText(file, MAX_PAGES).then(function(lines) { // Look for DOI - Use only first 80 lines to avoid catching article references var allText = lines.join("\n"), - doi = Zotero.Utilities.cleanDOI(lines.slice(0,80).join('\n')), + firstChunk = lines.slice(0,80).join('\n'), + doi = Zotero.Utilities.cleanDOI(firstChunk), promise; Zotero.debug(allText); + if(!doi) { + // Look for a JSTOR stable URL, which can be converted to a DOI by prepending 10.2307 + doi = firstChunk.match(/www.\jstor\.org\/stable\/(\S+)/i); + if(doi) { + doi = Zotero.Utilities.cleanDOI( + doi[1].indexOf('10.') == 0 ? doi[1] : '10.2307/' + doi[1] + ); + } + } + if(doi) { // Look up DOI Zotero.debug("RecognizePDF: Found DOI: "+doi); @@ -104,118 +117,7 @@ var Zotero_RecognizePDF = new function() { // If no DOI or ISBN, query Google Scholar return promise.fail(function(error) { Zotero.debug("RecognizePDF: "+error); - - // Use only first column from multi-column lines - const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/; - var cleanedLines = [], cleanedLineLengths = []; - for(var i=0; i<lines.length && cleanedLines.length<100; i++) { - var m = lineRe.exec(lines[i]); - if(m && m[1].split(' ').length > 3) { - cleanedLines.push(m[1]); - cleanedLineLengths.push(m[1].length); - } - } - - // get (not quite) median length - var lineLengthsLength = cleanedLineLengths.length; - if(lineLengthsLength < 20 - || cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") { - throw new Zotero.Exception.Alert("recognizePDF.noOCR"); - } - - var sortedLengths = cleanedLineLengths.sort(), - medianLength = sortedLengths[Math.floor(lineLengthsLength/2)]; - - // pick lines within 6 chars of the median (this is completely arbitrary) - var goodLines = [], - uBound = medianLength + 6, - lBound = medianLength - 6; - for (var i=0; i<lineLengthsLength; i++) { - if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) { - // Strip quotation marks so they don't mess up search query quoting - var line = cleanedLines[i].replace('"', ''); - goodLines.push(line); - } - } - - var nextLine = 0, - limited = false, - queryGoogle = function() { - // Once we hit the CAPTCHA once, don't keep trying - if(limited) throw new Zotero.Exception.Alert("recognizePDF.limit"); - - // Take the relevant parts of some lines (exclude hyphenated word) - var queryString = "", queryStringWords = 0; - while(queryStringWords < 25) { - if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches"); - - var words = goodLines.splice(nextLine, 1)[0].split(/\s+/); - // Try to avoid picking adjacent strings so the odds of them appearing in another - // document quoting our document is low. Every 7th line is a magic value - nextLine = (nextLine + 7) % goodLines.length; - - // get rid of first and last words - words.shift(); - words.pop(); - // make sure there are no long words (probably OCR mistakes) - var skipLine = false; - for(var i=0; i<words.length; i++) { - if(words[i].length > 20) { - skipLine = true; - break; - } - } - // add words to query - if(!skipLine && words.length) { - queryStringWords += words.length; - queryString += '"'+words.join(" ")+'" '; - } - } - - Zotero.debug("RecognizePDF: Query string "+queryString); - - var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search", - delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime); - - // Delay - return (delay > 0 ? Q.delay(delay) : Q.when()) - .then(function() { - Zotero.HTTP.lastGoogleScholarQueryTime = Date.now(); - return Zotero.HTTP.promise("GET", url, {"responseType":"document"}) - }) - .then(function(xmlhttp) { - var doc = xmlhttp.response, - deferred = Q.defer(), - translate = new Zotero.Translate.Web(); - - if(Zotero.Utilities.xpath(doc, "//form[@action='Captcha']").length) { - // Hit CAPTCHA - limited = true; - throw new Zotero.Exception.Alert("recognizePDF.limit"); - } - - translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); - translate.setDocument(Zotero.HTTP.wrapDocument(doc, url)); - translate.setHandler("translators", function(translate, detected) { - if(detected.length) { - deferred.resolve(_promiseTranslate(translate, libraryID)); - } else { - deferred.reject(new Zotero.Exception.Alert("recognizePDF.noMatches")); - } - }); - translate.getTranslators(); - - return deferred.promise; - }, function(e) { - if(e instanceof Zotero.HTTP.UnexpectedStatusException && e.status == 403) { - // Hit hard block - throw new Zotero.Exception.Alert("recognizePDF.limit"); - } - throw e; - }); - }; - - return queryGoogle().fail(queryGoogle).fail(queryGoogle); + return me.GSFullTextSearch.findItem(lines, libraryID, stopCheckCallback); }); }); } @@ -331,41 +233,15 @@ var Zotero_RecognizePDF = new function() { } // Validate ISBNs - var validIsbns = []; + var validIsbns = [], cleanISBN; for (var i =0; i < isbns.length; i++) { - if(_isValidISBN(isbns[i])) validIsbns.push(isbns[i]); + cleanISBN = Zotero.Utilities.cleanISBN(isbns[i]); + if(cleanISBN) validIsbns.push(cleanISBN); } return validIsbns; } /** - * Check whether an ISBNs is valid - * @private - * @return {Boolean} - */ - function _isValidISBN(isbn) { - if(isbn.length == 13) { - // ISBN-13 should start with 978 or 979 i.e. GS1 for book publishing industry - var prefix = isbn.slice(0,3); - if (prefix != "978" && prefix != "979") return false; - // Verify check digit - var check = 0; - for (var i = 0; i < 13; i+=2) check += isbn[i]*1; - for (i = 1; i < 12; i+=2) check += 3 * isbn[i]*1; - return (check % 10 == 0); - } else if(isbn.length == 10) { - // Verify ISBN-10 check digit - var check = 0; - for (var i = 0; i < 9; i++) check += isbn[i]*1 * (10-i); - // last number might be 'X' - if (isbn[9] == 'X' || isbn[9] == 'x') check += 10; - else check += isbn[i]*1; - return (check % 11 == 0); - } - return false; - } - - /** * @class Handles UI, etc. for recognizing multiple items */ this.ItemRecognizer = function () { @@ -388,7 +264,7 @@ var Zotero_RecognizePDF = new function() { this._items = items.slice(); this._itemTotal = items.length; - this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen"); + _progressWindow = this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen"); this._progressWindow.addEventListener("pageshow", function() { me._onWindowLoaded() }, false); }, @@ -398,7 +274,15 @@ var Zotero_RecognizePDF = new function() { "stop": function() { this._stopped = true; }, - + + /** + * Halts recognition and closes window + */ + "close": function() { + this.stop(); + this._progressWindow.close(); + }, + /** * Called when the progress window has been opened; adds items to the tree and begins recognizing * @param @@ -406,9 +290,11 @@ var Zotero_RecognizePDF = new function() { "_onWindowLoaded": function() { // populate progress window var treechildren = this._progressWindow.document.getElementById("treechildren"); + this._rowIDs = []; for(var i in this._items) { var treeitem = this._progressWindow.document.createElement('treeitem'); var treerow = this._progressWindow.document.createElement('treerow'); + this._rowIDs.push(this._items[i].id); var treecell = this._progressWindow.document.createElement('treecell'); treecell.setAttribute("id", "item-"+this._items[i].id+"-icon"); @@ -427,12 +313,22 @@ var Zotero_RecognizePDF = new function() { } var me = this; - this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator"); - this._progressWindow.document.getElementById("cancel-button").addEventListener("command", function() { - me.stop(); - me._progressWindow.close(); - }, false); - this._progressWindow.addEventListener("close", function() { me.stop() }, false); + + this._progressWindow.document.getElementById("tree").addEventListener( + "dblclick", function(event) { me._onDblClick(event, this); }); + + this._cancelHandler = function() { me.stop() }; + this._keypressCancelHandler = function(e) { + if(e.keyCode === KeyEvent.DOM_VK_ESCAPE) me.stop(); + }; + + _progressIndicator = this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator"); + this._progressWindow.document.getElementById("cancel-button") + .addEventListener("command", this._cancelHandler, false); + // Also cancel if the user presses Esc + this._progressWindow.addEventListener("keypress", this._keypressCancelHandler); + this._progressWindow.addEventListener("close", this._cancelHandler, false); + Zotero_RecognizePDF.GSFullTextSearch.resetQueryLimit(); this._recognizeItem(); }, @@ -452,23 +348,31 @@ var Zotero_RecognizePDF = new function() { return; } + // Order here matters. Otherwise we may show an incorrect label + if(this._stopped) { + this._done(true); + return; + } + this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100; var item = this._items.shift(), itemIcon = this._progressWindow.document.getElementById("item-"+item.id+"-icon"), - itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title"); + itemTitle = this._progressWindow.document.getElementById("item-"+item.id+"-title"), + rowNumber = this._rowIDs.indexOf(item.id); itemIcon.setAttribute("src", LOADING_IMAGE); + itemTitle.setAttribute("label", ""); var file = item.getFile(), me = this; (file - ? Zotero_RecognizePDF.recognize(file, item.libraryID) + ? Zotero_RecognizePDF.recognize(file, item.libraryID, function() { return me._stopped; }) : Q.reject(new Zotero.Exception.Alert("recognizePDF.fileNotFound"))) .then(function(newItem) { // If already stopped, delete if(me._stopped) { - Zotero.Items.erase(item.id); - return; + Zotero.Items.erase(newItem.id); + throw new Zotero.Exception.Alert('recognizePDF.stopped'); } // put new item in same collections as the old one @@ -484,37 +388,504 @@ var Zotero_RecognizePDF = new function() { itemTitle.setAttribute("label", newItem.getField("title")); itemIcon.setAttribute("src", SUCCESS_IMAGE); + me._rowIDs[rowNumber] = newItem.id; me._recognizeItem(); - }, function(error) { + }) + .catch(function(error) { Zotero.debug(error); Zotero.logError(error); itemTitle.setAttribute("label", error instanceof Zotero.Exception.Alert ? error.message : Zotero.getString("recognizePDF.error")); itemIcon.setAttribute("src", FAILURE_IMAGE); - if(error instanceof Zotero.Exception.Alert && error.name === "recognizePDF.limit") { - me._done(); + // Don't show "completed" label if stopped on last item + if(me._stopped && !me._items.length) { + me._done(true); } else { me._recognizeItem(); } - }).fin(function() { + }).finally(function() { // scroll to this item - me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-5)); + me._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, me._itemTotal-me._items.length-4)); }).done(); }, /** - * Cleans up after items are recognized, disabling the cancel button and making the progress window - * close on blur + * Cleans up after items are recognized, disabling the cancel button and + * making the progress window close on blur. + * @param {Boolean} cancelled Whether the process was cancelled */ - "_done": function() { + "_done": function(cancelled) { this._progressIndicator.value = 100; - this._progressWindow.document.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label"); - var me = this; - this._progressWindow.addEventListener("blur", - function() { me._progressWindow.setTimeout(function() { me._progressWindow.close() }, 2000) }, false); - this._progressWindow.document.getElementById("label").value = Zotero.getString("recognizePDF.complete.label"); + // Switch out cancel for close + var cancelButton = this._progressWindow.document.getElementById("cancel-button"), + me = this; + cancelButton.label = Zotero.getString("recognizePDF.close.label"); + cancelButton.removeEventListener("command", this._cancelHandler, false); + cancelButton.addEventListener("command", function() { me.close() }, false); + this._progressWindow.removeEventListener("keypress", this._keypressCancelHandler); + this._progressWindow.addEventListener("keypress", function() { me.close() }); + + if(Zotero.isMac) { + // On MacOS X, the windows are not always on top, so we hide them on + // blur to avoid clutter + this._setCloseTimer(); + } + this._progressWindow.document.getElementById("label").value = + cancelled ? Zotero.getString("recognizePDF.cancelled.label") + : Zotero.getString("recognizePDF.complete.label"); + }, + + /** + * Set a timer after which the window will close automatically. If the + * window is refocused, clear the timer and do not attempt to auto-close + * any more + * @private + */ + "_setCloseTimer": function() { + var me = this, win = this._progressWindow; + var focusListener = function() { + if(!win.zoteroCloseTimeoutID) return; + + win.clearTimeout(win.zoteroCloseTimeoutID); + delete win.zoteroCloseTimeoutID; + + win.removeEventListener('blur', blurListener, false); + win.removeEventListener('focus', focusListener, false); + }; + var blurListener = function() { + // Close window after losing focus for 5 seconds + win.zoteroCloseTimeoutID = win.setTimeout(function() { win.close() }, 5000); + // Prevent auto-close if we gain focus again + win.addEventListener("focus", focusListener, false); + }; + win.addEventListener("blur", blurListener, false); + }, + + /** + * Focus items in Zotero library when double-clicking them in the Retrieve + * metadata window. + * @param {Event} event + * @param {tree} tree XUL tree object + * @private + */ + "_onDblClick": function(event, tree) { + if (event && tree && event.type == "dblclick") { + var itemID = this._rowIDs[tree.treeBoxObject.getRowAt(event.clientX, event.clientY)]; + if(!itemID) return; + + // Get the right window. In tab mode, it's the container window + var lastWin = (window.ZoteroTab ? window.ZoteroTab.containerWindow : window); + + if (lastWin.ZoteroOverlay) { + lastWin.ZoteroOverlay.toggleDisplay(true); + } + + lastWin.ZoteroPane.selectItem(itemID, false, true); + lastWin.focus(); + } } - } + }; + + /** + * Singleton for querying Google Scholar. Ensures that all queries are + * sequential and respect the delay inbetween queries. + * @namespace + */ + this.GSFullTextSearch = new function() { + const GOOGLE_SCHOLAR_QUERY_DELAY = 2000; // In ms + var queryLimitReached = false, + inProgress = false, + queue = [], + stopCheckCallback; // As long as we process one query at a time, this is ok + // Load nsICookieManager2 + Components.utils.import("resource://gre/modules/Services.jsm"); + var cookieService = Services.cookies; + + /** + * Reset "Query Limit Reached" flag, so that we attempt to query Google again + */ + this.resetQueryLimit = function() { + queryLimitReached = false; + }; + + /** + * Queue up item for Google Scholar query + * @param {String[]} lines Lines of text to use for full-text query + * @param {Integer | null} libraryID Library to save the item to + * @param {Function} stopCheckCallback Function that returns true if the + * process is to be interrupted + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ + this.findItem = function(lines, libraryID, stopCheckCallback) { + if(!inProgress && queryLimitReached) { + // There's no queue, so we can reject immediately + return Q.reject(new Zotero.Exception.Alert("recognizePDF.limit")); + } + + var deferred = Q.defer(); + queue.push({ + deferred: deferred, + lines: lines, + libraryID: libraryID, + stopCheckCallback: stopCheckCallback + }); + _processQueue(); + return deferred.promise; + }; + + /** + * Process Google Scholar queue + * @private + * @param {Boolean} proceed Whether we should pop the next item off the queue + * This should not be true unless being called after processing + * another item + */ + function _processQueue(proceed) { + if(inProgress && !proceed) return; //only one at a time + + if(!queue.length) { + inProgress = false; + return; + } + + inProgress = true; + if(queryLimitReached) { + // Irreversibly blocked. Reject remaining items in queue + var item; + while(item = queue.shift()) { + item.deferred.reject(new Zotero.Exception.Alert("recognizePDF.limit")); + } + _processQueue(true); // Wrap it up + } else { + var item = queue.shift(); + + stopCheckCallback = item.stopCheckCallback; + if(stopCheckCallback && stopCheckCallback()) { + item.deferred.reject(new Zotero.Exception.Alert('recognizePDF.stopped')); + _processQueue(true); + return; + } + + item.deferred.resolve( + Q.try(getGoodLines, item.lines) + .then(function(lines) { + return queryGoogle(lines, item.libraryID, 3); // Try querying 3 times + }) + .finally(function() { _processQueue(true); }) + ); + } + } + + /** + * Select lines that are good candidates for Google Scholar query + * @private + * @param {String[]} lines + * @return {String[]} + */ + function getGoodLines(lines) { + // Use only first column from multi-column lines + const lineRe = /^[\s_]*([^\s]+(?: [^\s_]+)+)/; + var cleanedLines = [], cleanedLineLengths = []; + for(var i=0; i<lines.length && cleanedLines.length<100; i++) { + var m = lineRe.exec(lines[i]); + if(m && m[1].split(' ').length > 3) { + cleanedLines.push(m[1]); + cleanedLineLengths.push(m[1].length); + } + } + + // Get (not quite) median length + var lineLengthsLength = cleanedLineLengths.length; + if(lineLengthsLength < 20 + || cleanedLines[0] === "This is a digital copy of a book that was preserved for generations on library shelves before it was carefully scanned by Google as part of a project") { + throw new Zotero.Exception.Alert("recognizePDF.noOCR"); + } + + var sortedLengths = cleanedLineLengths.sort(), + medianLength = sortedLengths[Math.floor(lineLengthsLength/2)]; + + // Pick lines within 6 chars of the median (this is completely arbitrary) + var goodLines = [], + uBound = medianLength + 6, + lBound = medianLength - 6; + for (var i=0; i<lineLengthsLength; i++) { + if(cleanedLineLengths[i] > lBound && cleanedLineLengths[i] < uBound) { + // Strip quotation marks so they don't mess up search query quoting + var line = cleanedLines[i].replace('"', ''); + goodLines.push(line); + } + } + return goodLines; + } + + /** + * Query Google Scholar + * @private + * @param {String[]} goodLines + * @param {Integer | null} libraryID + * @param {Integer} tries Number of queries to attempt before giving up + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ + function queryGoogle(goodLines, libraryID, tries) { + if(tries <= 0) throw new Zotero.Exception.Alert("recognizePDF.noMatches"); + + // Take the relevant parts of some lines (exclude hyphenated word) + var queryString = "", queryStringWords = 0, nextLine = 0; + while(queryStringWords < 25) { + if(!goodLines.length) throw new Zotero.Exception.Alert("recognizePDF.noMatches"); + + var words = goodLines.splice(nextLine, 1)[0].split(/\s+/); + // Try to avoid picking adjacent strings so the odds of them appearing in another + // document quoting our document is low. Every 7th line is a magic value + nextLine = (nextLine + 7) % goodLines.length; + + // Get rid of first and last words + words.shift(); + words.pop(); + // Make sure there are no long words (probably OCR mistakes) + var skipLine = false; + for(var i=0; i<words.length; i++) { + if(words[i].length > 20) { + skipLine = true; + break; + } + } + // Add words to query + if(!skipLine && words.length) { + queryStringWords += words.length; + queryString += '"'+words.join(" ")+'" '; + } + } + + Zotero.debug("RecognizePDF: Query string " + queryString); + + var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search", + delay = GOOGLE_SCHOLAR_QUERY_DELAY - (Date.now() - Zotero.HTTP.lastGoogleScholarQueryTime); + + // Delay + return (delay > 0 ? Q.delay(delay) : Q()) + .then(function() { + Zotero.HTTP.lastGoogleScholarQueryTime = Date.now(); + return Zotero.HTTP.promise("GET", url, {"responseType":"document"}) + }) + .then(function(xmlhttp) { + return _checkCaptchaOK(xmlhttp, 3); + }, + function(e) { + return _checkCaptchaError(e, 3); + }) + .then(function(xmlhttp) { + var doc = xmlhttp.response, + deferred = Q.defer(), + translate = new Zotero.Translate.Web(); + + translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); + translate.setDocument(Zotero.HTTP.wrapDocument(doc, url)); + translate.setHandler("translators", function(translate, detected) { + if(detected.length) { + deferred.resolve(_promiseTranslate(translate, libraryID)); + } else { + deferred.resolve(Q.try(function() { + return queryGoogle(goodLines, libraryID, tries-1); + })); + } + }); + translate.getTranslators(); + + return deferred.promise; + }) + .catch(function(e) { + if(e.name == "recognizePDF.limit") { + queryLimitReached = true; + } + throw e; + }); + } + + /** + * Check for CAPTCHA on a page with HTTP 200 status + * @private + * @param {XMLHttpRequest} xmlhttp + * @param {Integer} tries Number of queries to attempt before giving up + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ + function _checkCaptchaOK(xmlhttp, tries) { + if(stopCheckCallback && stopCheckCallback()) { + throw new Zotero.Exception.Alert('recognizePDF.stopped'); + } + + if(Zotero.Utilities.xpath(xmlhttp.response, "//form[@action='Captcha']").length) { + return _solveCaptcha(xmlhttp, tries); + } + return xmlhttp; + } + + /** + * Check for CAPTCHA on an error page. Handle 403 and 503 pages + * @private + * @param {Zotero.HTTP.UnexpectedStatusException} e HTTP response error object + * @param {Integer} tries Number of queries to attempt before giving up + * @param {Boolean} dontClearCookies Whether to attempt to clear cookies in + * in order to get CAPTCHA to show up + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ + function _checkCaptchaError(e, tries, dontClearCookies) { + if(stopCheckCallback && stopCheckCallback()) { + throw new Zotero.Exception.Alert('recognizePDF.stopped'); + } + + // Check for captcha on error page + if(e instanceof Zotero.HTTP.UnexpectedStatusException + && (e.status == 403 || e.status == 503) && e.xmlhttp.response) { + if(_extractCaptchaFormData(e.xmlhttp.response)) { + return _solveCaptcha(e.xmlhttp, tries); + } else if(!dontClearCookies && e.xmlhttp.channel) { // Make sure we can obtain original URL + // AFAICT, for 403 errors, GS just says "sorry, try later", + // but if you clear cookies, you get a CAPTCHA + if(!_clearGSCookies(e.xmlhttp.channel.originalURI.host)) { + //user said no or no cookies removed + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + // Redo GET request + return Zotero.HTTP.promise("GET", e.xmlhttp.channel.originalURI.spec, {"responseType":"document"}) + .then(function(xmlhttp) { + return _checkCaptchaOK(xmlhttp, tries); + }, + function(e) { + return _checkCaptchaError(e, tries, true); // Don't try this again + }); + } + + Zotero.debug("RecognizePDF: Google Scholar returned an unexpected page" + + " with status " + e.status); + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + throw e; + } + + /** + * Prompt user to enter CPATCHA + * @private + * @param {XMLHttpRequest} xmlhttp + * @param {Integer} [tries] Number of queries to attempt before giving up + * @return {Promise} A promise resolved when PDF metadata has been retrieved + */ + function _solveCaptcha(xmlhttp, tries) { + var doc = xmlhttp.response; + + if(tries === undefined) tries = 3; + + if(!tries) { + Zotero.debug("RecognizePDF: Failed to solve CAPTCHA after multiple attempts."); + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + + tries--; + var formData = doc && _extractCaptchaFormData(doc); + if(!formData) { + Zotero.debug("RecognizePDF: Could not find CAPTCHA on page."); + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + + var io = { dataIn: { + title: Zotero.getString("recognizePDF.captcha.title"), + description: Zotero.getString("recognizePDF.captcha.description"), + imgUrl: formData.img + }}; + + _progressWindow.openDialog("chrome://zotero/content/captcha.xul", "", + "chrome,modal,resizable=no,centerscreen", io); + + if(!io.dataOut) { + Zotero.debug("RecognizePDF: No CAPTCHA entered"); + throw new Zotero.Exception.Alert('recognizePDF.limit'); + } + + formData.input.captcha = io.dataOut.captcha; + var url = '', prop; + for(prop in formData.input) { + url += '&' + encodeURIComponent(prop) + '=' + + encodeURIComponent(formData.input[prop]); + } + + url = formData.action + '?' + url.substr(1); + + return Zotero.HTTP.promise("GET", url, {"responseType":"document"}) + .then(function(xmlhttp) { + return _checkCaptchaOK(xmlhttp, tries); + }, + function(e) { + return _checkCaptchaError(e, tries); + }); + } + + /** + * Extract CAPTCHA form-related data from the CAPTCHA page + * @private + * @param {Document} doc DOM document object for the CAPTCHA page + * @return {Object} Object containing data describing CAPTCHA form + */ + function _extractCaptchaFormData(doc) { + var formData = {}; + + var img = doc.getElementsByTagName('img')[0]; + if(!img) return; + formData.img = img.src; + + var form = doc.forms[0]; + if(!form) return; + + formData.action = form.action; + formData.input = {}; + var inputs = form.getElementsByTagName('input'); + for(var i=0, n=inputs.length; i<n; i++) { + if(!inputs[i].name) continue; + formData.input[inputs[i].name] = inputs[i].value; + } + + formData.continue = "http://scholar.google.com"; + + return formData; + } + + /** + * Clear Google cookies to get the CAPTCHA page to appear + * @private + * @param {String} host Host of the Google Scholar page (in case it's proxied) + * @return {Boolean} Whether any cookies were cleared + */ + function _clearGSCookies(host) { + /* There don't seem to be any negative effects of deleting GDSESS + if(!Zotero.isStandalone) { + //ask user first + var response = Components.classes["@mozilla.org/embedcomp/prompt-service;1"] + .getService(Components.interfaces.nsIPromptService) + .confirm(null, "Clear Google Scholar cookies?", + "Google Scholar is attempting to block further queries. We can " + + "clear certain cookies and try again. This may affect some " + + "temporary Google preferences or it may log you out. May we clear" + + " your Google Scholar cookies?"); + if(!response) return; + }*/ + + var removed = false, cookies = cookieService.getCookiesFromHost(host); + while(cookies.hasMoreElements()) { + var cookie = cookies.getNext().QueryInterface(Components.interfaces.nsICookie2); + if(["GDSESS", "PREF"].indexOf(cookie.name) !== -1) { // GDSESS doesn't seem to always be enough + Zotero.debug("RecognizePDF: Removing cookie " + cookie.name + " for host " + + cookie.host + " and path " + cookie.path); + cookieService.remove(cookie.host, cookie.name, cookie.path, false); + removed = true; + } + } + + if(!removed) { + Zotero.debug("RecognizePDF: No cookies removed"); + } + + return removed; + } + }; } \ No newline at end of file diff --git a/chrome/locale/en-US/zotero/zotero.dtd b/chrome/locale/en-US/zotero/zotero.dtd @@ -4,6 +4,8 @@ <!ENTITY zotero.general.deselectAll "Deselect All"> <!ENTITY zotero.general.edit "Edit"> <!ENTITY zotero.general.delete "Delete"> +<!ENTITY zotero.general.ok "OK"> +<!ENTITY zotero.general.cancel "Cancel"> <!ENTITY zotero.errorReport.title "Zotero Error Report"> <!ENTITY zotero.errorReport.unrelatedMessages "The error log may include messages unrelated to Zotero."> @@ -253,7 +255,6 @@ <!ENTITY zotero.recognizePDF.cancel.label "Cancel"> <!ENTITY zotero.recognizePDF.pdfName.label "PDF Name"> <!ENTITY zotero.recognizePDF.itemName.label "Item Name"> -<!ENTITY zotero.recognizePDF.captcha.label "Type the text below to continue retrieving metadata."> <!ENTITY zotero.rtfScan.title "RTF Scan"> <!ENTITY zotero.rtfScan.cancel.label "Cancel"> @@ -282,4 +283,4 @@ <!ENTITY zotero.downloadManager.label "Save to Zotero"> <!ENTITY zotero.downloadManager.saveToLibrary.description "Attachments cannot be saved to the currently selected library. This item will be saved to your library instead."> -<!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences."> +<!ENTITY zotero.downloadManager.noPDFTools.description "To use this feature, you must first install the PDF tools in the Search pane of the Zotero preferences."> +\ No newline at end of file diff --git a/chrome/locale/en-US/zotero/zotero.properties b/chrome/locale/en-US/zotero/zotero.properties @@ -895,12 +895,16 @@ proxies.recognized.add = Add Proxy recognizePDF.noOCR = PDF does not contain OCRed text. recognizePDF.couldNotRead = Could not read text from PDF. -recognizePDF.noMatches = No matching references found. -recognizePDF.fileNotFound = File not found. -recognizePDF.limit = Query limit reached. Try again later. +recognizePDF.noMatches = No matching references found +recognizePDF.fileNotFound = File not found +recognizePDF.limit = Google Scholar query limit reached. Try again later. recognizePDF.error = An unexpected error occurred. -recognizePDF.complete.label = Metadata Retrieval Complete. +recognizePDF.stopped = Cancelled +recognizePDF.complete.label = Metadata Retrieval Complete +recognizePDF.cancelled.label = Metadata Retrieval Cancelled recognizePDF.close.label = Close +recognizePDF.captcha.title = Please enter CAPTCHA +recognizePDF.captcha.description = Zotero uses Google Scholar to help identify PDFs. To continue using Google Scholar, please enter the text from the image below. rtfScan.openTitle = Select a file to scan rtfScan.scanning.label = Scanning RTF Document… diff --git a/chrome/skin/default/zotero/zotero.css b/chrome/skin/default/zotero/zotero.css @@ -303,7 +303,6 @@ label.zotero-text-link { margin-bottom: 1em; } - .zotero-small-progress-indicator { list-style-image: url(chrome://global/skin/icons/notloading_16.png); margin-left: -2px; @@ -316,4 +315,19 @@ label.zotero-text-link { #zotero-note-window { padding-bottom: 4px; +} + +#zotero-captcha-description { + max-width: 300px; + padding-bottom: 4px; + text-align: justify; +} + +#zotero-captcha-error { + max-width: 300px; + padding-bottom: 4px; + padding-top: 4px; + font-weight: bold; + color: red; + text-align: center; } \ No newline at end of file