www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 5fc97828605648241b0c7dd55dd9a3919be0c486
parent c9f15927048998ad4810d62cc432f11f99e88df0
Author: Simon Kornblith <simon@simonster.com>
Date:   Wed,  3 Sep 2008 06:20:19 +0000

- better interface for PDF recognizer
- slightly improved PDF recognizer performance


Diffstat:
Achrome/content/zotero/pdfProgress.xul | 23+++++++++++++++++++++++
Mchrome/content/zotero/recognizePDF.js | 323++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
Mchrome/locale/en-US/zotero/zotero.dtd | 7+++++++
Mchrome/locale/en-US/zotero/zotero.properties | 10++++++++--
Achrome/skin/default/zotero/cross.png | 0
Achrome/skin/default/zotero/indicator.gif | 0
6 files changed, 294 insertions(+), 69 deletions(-)

diff --git a/chrome/content/zotero/pdfProgress.xul b/chrome/content/zotero/pdfProgress.xul @@ -0,0 +1,23 @@ +<?xml version="1.0" ?> +<?xml-stylesheet href="chrome://global/skin/" type="text/css"?> +<!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd"> + +<window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul" + title="&zotero.progress.title;" width="550" height="230" + id="zotero-progress"> + <vbox style="padding:10px" flex="1"> + <label id="label" control="progress-indicator" value="&zotero.recognizePDF.recognizing.label;"/> + <hbox align="center"> + <progressmeter id="progress-indicator" mode="determined" flex="1"/> + <button id="cancel-button" label="&zotero.recognizePDF.cancel.label;"/> + </hbox> + <tree flex="1" id="tree" hidecolumnpicker="true"> + <treecols> + <treecol id="success-col" style="width:20px;"/> + <treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/> + <treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/> + </treecols> + <treechildren id="treechildren"/> + </tree> + </vbox> +</window> diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js @@ -23,18 +23,24 @@ /** * @fileOverview Tools for automatically retrieving a citation for the given PDF */ +const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png"; +const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png"; +const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif"; /** * Front end for recognizing PDFs * @namespace */ var Zotero_RecognizePDF = new function() { + var _progressWindow, _progressIndicator; + /** * Checks whether a given PDF could theoretically be recognized * @returns {Boolean} True if the PDF can be recognized, false if it cannot be */ this.canRecognize = function(/**Zotero.Item*/ item) { - return (item.attachmentMIMEType && item.attachmentMIMEType == "application/pdf" && !item.getSource()); + return (Zotero.Fulltext.pdfConverterIsRegistered && item.attachmentMIMEType && + item.attachmentMIMEType == "application/pdf" && !item.getSource()); } /** @@ -43,43 +49,178 @@ var Zotero_RecognizePDF = new function() { */ this.recognizeSelected = function() { var items = ZoteroPane.getSelectedItems(); - if (!items) { - return; + if (!items) return; + var itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer(); + itemRecognizer.recognizeItems(items); + } +} + +/** + * @class Handles UI, etc. for recognizing multiple items + */ +Zotero_RecognizePDF.ItemRecognizer = function () { + this._stopped = false; +} + +/** + * Retreives metadata for the PDF items passed, displaying a progress dialog during conversion + * and placing the PDFs as a children of the new items + * @param {Zotero.Item[]} items + */ +Zotero_RecognizePDF.ItemRecognizer.prototype.recognizeItems = function(items) { + var me = this; + this._items = items.slice(); + this._itemTotal = items.length; + + this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen"); + this._progressWindow.addEventListener("pageshow", function() { me._onWindowLoaded() }, false); +} + +/** + * Halts recognition of PDFs + */ +Zotero_RecognizePDF.ItemRecognizer.prototype.stop = function() { + this._stopped = true; +} + +/** + * Called when the progress window has been opened; adds items to the tree and begins recognizing + * @param + */ +Zotero_RecognizePDF.ItemRecognizer.prototype._onWindowLoaded = function() { + // populate progress window + var treechildren = this._progressWindow.document.getElementById("treechildren"); + for(var i in this._items) { + var treeitem = this._progressWindow.document.createElement('treeitem'); + var treerow = this._progressWindow.document.createElement('treerow'); + + var treecell = this._progressWindow.document.createElement('treecell'); + treecell.setAttribute("id", "item-"+this._items[i].id+"-icon"); + treerow.appendChild(treecell); + + treecell = this._progressWindow.document.createElement('treecell'); + treecell.setAttribute("label", this._items[i].getField("title")); + treerow.appendChild(treecell); + + treecell = this._progressWindow.document.createElement('treecell'); + treecell.setAttribute("id", "item-"+this._items[i].id+"-title"); + treerow.appendChild(treecell); + + treeitem.appendChild(treerow); + treechildren.appendChild(treeitem); + } + + var me = this; + this._progressIndicator = this._progressWindow.document.getElementById("progress-indicator"); + this._progressWindow.document.getElementById("cancel-button").addEventListener("command", function() { + me.stop(); + me._progressWindow.close(); + }, false); + this._progressWindow.addEventListener("close", function() { me.stop() }, false); + this._recognizeItem(); +} + +/** + * Shifts an item off of this._items and recognizes it, then calls itself again if there are more + * @private + */ +Zotero_RecognizePDF.ItemRecognizer.prototype._recognizeItem = function() { + if(!this._items.length) { + this._done(); + return; + } + + this._progressIndicator.value = (this._itemTotal-this._items.length)/this._itemTotal*100; + this._item = this._items.shift(); + + this._progressWindow.document.getElementById("item-"+this._item.id+"-icon"). + setAttribute("src", Zotero_RecognizePDF_LOADING_IMAGE); + + var file = this._item.getFile(); + if(file) { + var recognizer = new Zotero_RecognizePDF.Recognizer(); + var me = this; + recognizer.recognize(file, function(newItem, error) { me._callback(newItem, error) }); + } else { + this._callback(false, "recognizePDF.fileNotFound"); + } +} + +/** + * Cleans up after items are recognized, disabling the cancel button and making the progress window + * close on blur + */ +Zotero_RecognizePDF.ItemRecognizer.prototype._done = function() { + this._progressIndicator.value = 100; + this._progressWindow.document.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label"); + var me = this; + this._progressWindow.addEventListener("blur", + function() { me._progressWindow.setTimeout(function() { me._progressWindow.close() }, 2000) }, false); + this._progressWindow.document.getElementById("label").value = Zotero.getString("recognizePDF.complete.label"); +} + +/** + * Callback function to be executed upon recognition completion + * @param {Zotero.Item|Boolean} newItem The new item created from translation, or false if + * recognition was unsuccessful + * @param {String} [error] The error name, if recognition was unsuccessful. + */ +Zotero_RecognizePDF.ItemRecognizer.prototype._callback = function(newItem, error) { + if(this._stopped) { + if(newItem) Zotero.Items.erase(newItem.id); + return; + } + + if(newItem) { + // put new item in same collections as the old one + var itemCollections = this._item.getCollections(); + for(var j=0; j<itemCollections.length; j++) { + var collection = Zotero.Collections.get(itemCollections[j]); + collection.addItem(newItem.id); } - this.recognizeItems(items); + + // put old item as a child of the new item + this._item.setSource(newItem.id); + this._item.save(); } + + // add name + this._progressWindow.document.getElementById("item-"+this._item.id+"-title"). + setAttribute("label", (newItem ? newItem.getField("title") : Zotero.getString(error))); + // update icon + this._progressWindow.document.getElementById("item-"+this._item.id+"-icon"). + setAttribute("src", (newItem ? Zotero_RecognizePDF_SUCCESS_IMAGE : Zotero_RecognizePDF_FAILURE_IMAGE)); - /** - * Retreives metadata for the PDF items passed, placing the PDFs as a children of the new items - */ - this.recognizeItems = function(/**Zotero.Item[]*/ items) { - var itemsCopy = items.slice(); - var item = itemsCopy.shift(); - var file = item.getFile(); - if(file) { - var recognizer = new Zotero_RecognizePDF.Recognizer(); - recognizer.recognize(file, item.getField("title"), - function(translate, newItem) { - // put new item in same collections as the old one - var itemCollections = item.getCollections(); - for(var j=0; j<itemCollections.length; j++) { - var collection = Zotero.Collections.get(itemCollections[j]); - collection.addItem(newItem.id); - } - - // put old item as a child of the new item - item.setSource(newItem.id); - item.save(); - - // continue recognizing - if(itemsCopy.length) Zotero_RecognizePDF.recognizeItems(itemsCopy); - }); - } else { - if(itemsCopy.length) Zotero_RecognizePDF.recognizeItems(itemsCopy); + if(error == "recognizePDF.limit") { + // now done, since we hit the query limit + var error = Zotero.getString(error); + for(var i in this._items) { + this._progressWindow.document.getElementById("item-"+this._items[i].id+"-title"). + setAttribute("label", error); + this._progressWindow.document.getElementById("item-"+this._items[i].id+"-icon"). + setAttribute("src", Zotero_RecognizePDF_FAILURE_IMAGE); } + this._done(); + } else { + // scroll to this item + this._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, this._itemTotal-this._items.length-5)); + // continue recognizing + this._recognizeItem(); } } +/*Zotero_RecognizePDF.ItemRecognizer.prototype._captchaCallback = function(img) { + var io = {dataIn:img}; + Zotero.debug(img); + this._progressWindow.openDialog("chrome://zotero/content/pdfCaptcha.xul", "", "chrome,modal,resizable=no", io); + + if(io.dataOut) return io.dataOut; + + this.stop(); + this._progressWindow.close(); + return false; +}*/ + /** * @class PDF recognizer backend */ @@ -89,19 +230,23 @@ Zotero_RecognizePDF.Recognizer = function () {} * Retrieves metadata for a PDF and saves it as an item * * @param {nsIFile} file The PDF file to retrieve metadata for - * @param {String} pdfTitle The title of the PDF * @param {Function} callback The function to be executed when recognition is complete + * @param {Function} [captchaCallback] The function to be executed if a CAPTCHA is encountered + * (function will be passed image as URL and must return text of CAPTCHA) */ -Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, callback) { +Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, captchaCallback) { const MAX_PAGES = 2; - this._pdfTitle = pdfTitle; - this._callback = callback; + const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/; - const whitespaceRe = /^\s*$/; + this._callback = callback; + //this._captchaCallback = captchaCallback; var cacheFile = Zotero.getZoteroDirectory(); - cacheFile.append("recognizePDFcache.txt"); + cacheFile.append(".zotero-recpdf-cache"); + if(cacheFile.exists()) { + cacheFile.remove(false); + } Zotero.debug('Running pdftotext -enc UTF-8 -nopgbrk ' + '-l ' + MAX_PAGES + ' "' + file.path + '" "' @@ -113,10 +258,15 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca exec.append(Zotero.Fulltext.pdfConverterFileName); proc.init(exec); - var args = ['-enc', 'UTF-8', '-nopgbrk', '-raw', '-l', MAX_PAGES]; + var args = ['-enc', 'UTF-8', '-nopgbrk', '-layout', '-l', MAX_PAGES]; args.push(file.path, cacheFile.path); proc.run(true, args, args.length); + if(!cacheFile.exists()) { + this._callback(false, "recognizePDF.couldNotRead"); + return; + } + var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"] .createInstance(Components.interfaces.nsIFileInputStream); inputStream.init(cacheFile, 0x01, 0664, 0); @@ -131,31 +281,34 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca var lineLengths = []; var str = {}; while(intlStream.readLine(str)) { - if(!whitespaceRe.test(str.value)) { - lines.push(str.value); - lineLengths.push(str.value.length); + var line = lineRe.exec(str.value); + if(line) { + lines.push(line[1]); + lineLengths.push(line[1].length); } } // get (not quite) median length var lineLengthsLength = lineLengths.length; if(lineLengthsLength < 20) { - this._error(); + this._callback(false, "recognizePDF.noOCR"); return; } var sortedLengths = lineLengths.sort(); var medianLength = sortedLengths[Math.floor(lineLengthsLength/2)]; - // pick lines within 4 chars of the median + // pick lines within 4 chars of the median (this is completely arbitrary) this._goodLines = []; var uBound = medianLength + 4; var lBound = medianLength - 4; for (var i=0; i<lineLengthsLength; i++) { - if(lineLengths[i] >= lBound && lineLengths[i] <= uBound) this._goodLines.push(lines[i]); + if(lineLengths[i] > lBound && lineLengths[i] < uBound) this._goodLines.push(lines[i]); } this._startLine = this._iteration = 0; + + cacheFile.remove(false); this._queryGoogle(); } @@ -165,18 +318,32 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca */ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { if(this._iteration > 3 || this._startLine >= this._goodLines.length) { - this._error(); + try { + if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); + } catch(e) {} + this._callback(false, "recognizePDF.noMatches"); return; } + this._iteration++; // take the relevant parts of some lines (exclude hyphenated word) var queryStringWords = 0; var queryString = ""; while(queryStringWords < 25 && this._startLine < this._goodLines.length) { var words = this._goodLines[this._startLine].split(/\s+/); + // get rid of first and last words words.shift(); words.pop(); - if(words.length) { + // make sure there are no long words (probably OCR mistakes) + var skipLine = false; + for(var i=0; i<words.length; i++) { + if(words[i].length > 20) { + skipLine = true; + break; + } + } + // add words to query + if(!skipLine && words.length) { queryStringWords += words.length; queryString += '"'+words.join(" ")+'" '; } @@ -185,27 +352,61 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() { Zotero.debug("RecognizePDF: Query string "+queryString); // pass query string to Google Scholar and translate - var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString); - this.hiddenBrowser = Zotero.Browser.createHiddenBrowser(); + var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search"; + if(!this._hiddenBrowser) { + this._hiddenBrowser = Zotero.Browser.createHiddenBrowser(); + this._hiddenBrowser.docShell.allowImages = false; + } var me = this; var translate = new Zotero.Translate("web", true, false); translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289"); - translate.setHandler("itemDone", this._callback); + translate.setHandler("itemDone", function(translate, item) { + Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser); + me._callback(item); + }); translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) }); - translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle() }); + translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); }); + + this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true); + + // to make us a little less obvious, specify a referrer + var referrer = Components.classes["@mozilla.org/network/io-service;1"] + .getService(Components.interfaces.nsIIOService) + .newURI(this._previousURL ? this._previousURL : "http://scholar.google.com/", null, null); + this._hiddenBrowser.loadURIWithFlags(url, + Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, referrer, null, null); - this.hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true); - this.hiddenBrowser.loadURI(url); + this._previousURL = url; } /** - * Callback to be executed when Google Scholar is loaded + * To be executed when Google Scholar is loaded * @private */ Zotero_RecognizePDF.Recognizer.prototype._scrape = function(/**Zotero.Translate*/ translate) { - this.hiddenBrowser.removeEventListener("pageshow", this._scrape.caller, true); - translate.setDocument(this.hiddenBrowser.contentDocument); + if(this._hiddenBrowser.contentDocument.title == "403 Forbidden") { + // hit the captcha + /* + var forms = this._hiddenBrowser.contentDocument.getElementsByTagName("form"); + if(forms.length && forms[0].getAttribute("action") == "Captcha") { + var captchaImage = forms[0].getElementsByTagName("img"); + var captchaBox = this._hiddenBrowser.contentDocument.getElementsByName("captcha"); + if(captchaImage.length && captchaBox.length && this._captchaCallback) { + var text = this._captchaCallback(captchaImage[0].src); + if(text) { + captchaBox[0].value = text; + forms[0].submit(); + return; + } + } + }*/ + this._callback(false, "recognizePDF.limit"); + return; + } + + this._hiddenBrowser.removeEventListener("pageshow", this._scrape.caller, true); + translate.setDocument(this._hiddenBrowser.contentDocument); translate.translate(); } @@ -220,16 +421,4 @@ Zotero_RecognizePDF.Recognizer.prototype._selectItems = function(/**Zotero.Trans obj[i] = items; return obj; } -} - -/** - * Displays an error when a PDF cannot be recognized - * @private - */ -Zotero_RecognizePDF.Recognizer.prototype._error = function() { - var promptService = Components.classes["@mozilla.org/embedcomp/prompt-service;1"] - .getService(Components.interfaces.nsIPromptService); - promptService.alert(window, - Zotero.getString('recognizePDF.couldNotRecognize.title'), - Zotero.getString('recognizePDF.couldNotRecognize.message', this._pdfTitle)); } \ No newline at end of file diff --git a/chrome/locale/en-US/zotero/zotero.dtd b/chrome/locale/en-US/zotero/zotero.dtd @@ -164,3 +164,9 @@ <!ENTITY zotero.proxy.recognized.warning.secondary "Adding other proxies allows malicious sites to masquerade as sites you trust."> <!ENTITY zotero.proxy.recognized.disable.label "Do not automatically redirect requests through previously recognized proxies"> <!ENTITY zotero.proxy.recognized.ignore.label "Ignore"> + +<!ENTITY zotero.recognizePDF.recognizing.label "Retrieving Metadata..."> +<!ENTITY zotero.recognizePDF.cancel.label "Cancel"> +<!ENTITY zotero.recognizePDF.pdfName.label "PDF Name"> +<!ENTITY zotero.recognizePDF.itemName.label "Item Name"> +<!ENTITY zotero.recognizePDF.captcha.label "Type the text below to continue retrieving metadata."> +\ No newline at end of file diff --git a/chrome/locale/en-US/zotero/zotero.properties b/chrome/locale/en-US/zotero/zotero.properties @@ -516,5 +516,10 @@ proxies.recognized.add = Add Proxy proxies.enableTransparentWarning.title = Warning proxies.enableTransparentWarning.description = Please ensure that the proxies listed below belong to a library, school, or other institution with which you are affiliated. A malicious proxy could pose a security risk. -recognizePDF.couldNotRecognize.title = Could Not Retrieve Metada -recognizePDF.couldNotRecognize.message = Zotero could not retrieve metadata for "%1$S". +recognizePDF.noOCR = PDF does not contain OCRed text. +recognizePDF.couldNotRead = Could not read text from PDF. +recognizePDF.noMatches = No matching references found. +recognizePDF.fileNotFound = File not found. +recognizePDF.limit = Query limit reached. Try again later. +recognizePDF.complete.label = Metadata Retrieval Complete. +recognizePDF.close.label = Close +\ No newline at end of file diff --git a/chrome/skin/default/zotero/cross.png b/chrome/skin/default/zotero/cross.png Binary files differ. diff --git a/chrome/skin/default/zotero/indicator.gif b/chrome/skin/default/zotero/indicator.gif Binary files differ.