- better interface for PDF recognizer - slightly improved PDF recognizer performance - www - Unnamed repository; edit this file 'description' to name the repository.

commit 5fc97828605648241b0c7dd55dd9a3919be0c486
parent c9f15927048998ad4810d62cc432f11f99e88df0
Author: Simon Kornblith <simon@simonster.com>
Date:   Wed,  3 Sep 2008 06:20:19 +0000

- better interface for PDF recognizer
- slightly improved PDF recognizer performance


Diffstat:
A chrome/content/zotero/pdfProgress.xul  | 23 +++++++++++++++++++++++
M chrome/content/zotero/recognizePDF.js  | 323 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M chrome/locale/en-US/zotero/zotero.dtd  | 7 +++++++
M chrome/locale/en-US/zotero/zotero.properties  | 10 ++++++++--
A chrome/skin/default/zotero/cross.png  | 0 
A chrome/skin/default/zotero/indicator.gif  | 0

6 files changed, 294 insertions(+), 69 deletions(-)
diff --git a/chrome/content/zotero/pdfProgress.xul b/chrome/content/zotero/pdfProgress.xul
@@ -0,0 +1,23 @@
+<?xml version="1.0" ?>
+<?xml-stylesheet href="chrome://global/skin/" type="text/css"?>
+<!DOCTYPE window SYSTEM "chrome://zotero/locale/zotero.dtd">
+
+<!--
+	Progress dialog shown while metadata is retrieved for PDFs.
+	recognizePDF.js looks the following elements up by id: "label", "progress-indicator",
+	"cancel-button", "tree" and "treechildren". The rows of the tree are not declared here;
+	they are appended to "treechildren" by the script, one per PDF.
+-->
+<window xmlns="http://www.mozilla.org/keymaster/gatekeeper/there.is.only.xul"
+	title="&zotero.progress.title;" width="550" height="230"
+	id="zotero-progress">
+	<vbox style="padding:10px" flex="1">
+		<label id="label" control="progress-indicator" value="&zotero.recognizePDF.recognizing.label;"/>
+		<hbox align="center">
+			<progressmeter id="progress-indicator" mode="determined" flex="1"/>
+			<button id="cancel-button" label="&zotero.recognizePDF.cancel.label;"/>
+		</hbox>
+		<tree flex="1" id="tree" hidecolumnpicker="true">
+			<!-- columns: status icon, title of the PDF, title of the recognized item -->
+			<treecols>
+				<treecol id="success-col" style="width:20px;"/>
+				<treecol label="&zotero.recognizePDF.pdfName.label;" id="pdf-col" flex="1"/>
+				<treecol label="&zotero.recognizePDF.itemName.label;" id="item-col" flex="2"/>
+			</treecols>
+			<treechildren id="treechildren"/>
+		</tree>
+	</vbox>
+</window>
diff --git a/chrome/content/zotero/recognizePDF.js b/chrome/content/zotero/recognizePDF.js
@@ -23,18 +23,24 @@
 /**
  * @fileOverview Tools for automatically retrieving a citation for the given PDF
  */
+const Zotero_RecognizePDF_SUCCESS_IMAGE = "chrome://zotero/skin/tick.png";
+const Zotero_RecognizePDF_FAILURE_IMAGE = "chrome://zotero/skin/cross.png";
+const Zotero_RecognizePDF_LOADING_IMAGE = "chrome://zotero/skin/indicator.gif";
  
 /**
  * Front end for recognizing PDFs
  * @namespace
  */
 var Zotero_RecognizePDF = new function() {
+	var _progressWindow, _progressIndicator;
+	
 	/**
 	 * Checks whether a given PDF could theoretically be recognized
 	 * @returns {Boolean} True if the PDF can be recognized, false if it cannot be
 	 */
 	this.canRecognize = function(/**Zotero.Item*/ item) {
-		return (item.attachmentMIMEType && item.attachmentMIMEType == "application/pdf" && !item.getSource());
+		return (Zotero.Fulltext.pdfConverterIsRegistered && item.attachmentMIMEType &&
+			item.attachmentMIMEType == "application/pdf" && !item.getSource());
 	}
 	
 	/**
@@ -43,43 +49,178 @@ var Zotero_RecognizePDF = new function() {
 	 */
 	this.recognizeSelected = function() {
 		var items = ZoteroPane.getSelectedItems();
-		if (!items) {
-			return;
+		if (!items) return;
+		var itemRecognizer = new Zotero_RecognizePDF.ItemRecognizer();
+		itemRecognizer.recognizeItems(items);
+	}
+}
+
+/**
+ * @class Handles UI, etc. for recognizing multiple items: owns the progress window and
+ *	recognizes the queued items one at a time
+ */
+Zotero_RecognizePDF.ItemRecognizer = function () {
+	// set to true by stop(); checked in _callback() before any result is processed
+	this._stopped = false;
+}
+
+/**
+ * Retrieves metadata for the PDF items passed, displaying a progress dialog during conversion
+ * and placing the PDFs as children of the new items
+ * @param {Zotero.Item[]} items
+ */
+Zotero_RecognizePDF.ItemRecognizer.prototype.recognizeItems = function(items) {
+	// queue a copy, since items are shifted off the queue as they are recognized
+	this._items = items.slice();
+	this._itemTotal = items.length;
+	
+	var recognizer = this;
+	var features = "chrome,close=yes,resizable=yes,dependent,dialog,centerscreen";
+	this._progressWindow = window.openDialog("chrome://zotero/content/pdfProgress.xul", "", features);
+	
+	// the tree is populated and recognition begins from _onWindowLoaded()
+	this._progressWindow.addEventListener("pageshow", function() {
+		recognizer._onWindowLoaded();
+	}, false);
+}
+
+/**
+ * Halts recognition of PDFs. This only sets a flag: _callback() checks it when the recognition
+ * in progress reports back, erases any item that was created, and does not continue with the
+ * next queued item.
+ */
+Zotero_RecognizePDF.ItemRecognizer.prototype.stop = function() {
+	this._stopped = true;	
+}
+
+/**
+ * Called when the progress window has been opened; adds a row for each item to the tree, hooks
+ * up the cancel button and the window's close event, and begins recognizing
+ * @private
+ */
+Zotero_RecognizePDF.ItemRecognizer.prototype._onWindowLoaded = function() {
+	var me = this;
+	var doc = this._progressWindow.document;
+	
+	// populate progress window; iterate by index, since for...in on an array would also visit
+	// any enumerable properties that have been added to Array.prototype
+	var treechildren = doc.getElementById("treechildren");
+	for(var i=0; i<this._items.length; i++) {
+		var item = this._items[i];
+		var treeitem = doc.createElement('treeitem');
+		var treerow = doc.createElement('treerow');
+		
+		// status icon; looked up by id from _recognizeItem() and _callback()
+		var treecell = doc.createElement('treecell');
+		treecell.setAttribute("id", "item-"+item.id+"-icon");
+		treerow.appendChild(treecell);
+		
+		// title of the PDF attachment
+		treecell = doc.createElement('treecell');
+		treecell.setAttribute("label", item.getField("title"));
+		treerow.appendChild(treecell);
+		
+		// title of the recognized item or an error message; filled in by _callback()
+		treecell = doc.createElement('treecell');
+		treecell.setAttribute("id", "item-"+item.id+"-title");
+		treerow.appendChild(treecell);
+		
+		treeitem.appendChild(treerow);
+		treechildren.appendChild(treeitem);
+	}
+	
+	this._progressIndicator = doc.getElementById("progress-indicator");
+	doc.getElementById("cancel-button").addEventListener("command", function() {
+		me.stop();
+		me._progressWindow.close();
+	}, false);
+	// closing the window by any other means halts recognition as well
+	this._progressWindow.addEventListener("close", function() { me.stop() }, false);
+	this._recognizeItem();
+}
+
+/**
+ * Takes the next item off the front of this._items and starts recognizing it. _callback() is
+ * invoked with the outcome and calls this function again for the following item. When the
+ * queue is empty, _done() is called instead.
+ * @private
+ */
+Zotero_RecognizePDF.ItemRecognizer.prototype._recognizeItem = function() {
+	var remaining = this._items.length;
+	if(remaining == 0) {
+		this._done();
+		return;
+	}
+	
+	// progress reflects the items handled before this one
+	var handled = this._itemTotal - remaining;
+	this._progressIndicator.value = handled/this._itemTotal*100;
+	
+	var current = this._items.shift();
+	this._item = current;
+	
+	var iconCell = this._progressWindow.document.getElementById("item-"+current.id+"-icon");
+	iconCell.setAttribute("src", Zotero_RecognizePDF_LOADING_IMAGE);
+	
+	var file = current.getFile();
+	if(!file) {
+		this._callback(false, "recognizePDF.fileNotFound");
+		return;
+	}
+	
+	var me = this;
+	var recognizer = new Zotero_RecognizePDF.Recognizer();
+	recognizer.recognize(file, function(newItem, error) { me._callback(newItem, error) });
+}
+
+/**
+ * Cleans up after items are recognized: fills the progress meter, relabels the cancel button
+ * as a close button, shows the completion message, and makes the progress window close two
+ * seconds after it loses focus
+ */
+Zotero_RecognizePDF.ItemRecognizer.prototype._done = function() {
+	var recognizer = this;
+	var doc = this._progressWindow.document;
+	
+	this._progressIndicator.value = 100;
+	doc.getElementById("cancel-button").label = Zotero.getString("recognizePDF.close.label");
+	
+	function closeWindow() { recognizer._progressWindow.close() }
+	function closeAfterDelay() { recognizer._progressWindow.setTimeout(closeWindow, 2000) }
+	this._progressWindow.addEventListener("blur", closeAfterDelay, false);
+	
+	doc.getElementById("label").value = Zotero.getString("recognizePDF.complete.label");
+}
+
+/**
+ * Callback function to be executed upon recognition completion
+ * @param {Zotero.Item|Boolean} newItem The new item created from translation, or false if
+ *	recognition was unsuccessful
+ * @param {String} [error] The error name, if recognition was unsuccessful.
+ */
+Zotero_RecognizePDF.ItemRecognizer.prototype._callback = function(newItem, error) {
+	if(this._stopped) {
+		// stop() was called while this item was being recognized; discard the result
+		if(newItem) Zotero.Items.erase(newItem.id);
+		return;
+	}
+	
+	if(newItem) {
+		// put new item in same collections as the old one
+		var itemCollections = this._item.getCollections();
+		for(var j=0; j<itemCollections.length; j++) {
+			var collection = Zotero.Collections.get(itemCollections[j]);
+			collection.addItem(newItem.id);
 		}
-		this.recognizeItems(items);
+		
+		// put old item as a child of the new item
+		this._item.setSource(newItem.id);
+		this._item.save();	
 	}
+		
+	// add name
+	this._progressWindow.document.getElementById("item-"+this._item.id+"-title").
+		setAttribute("label", (newItem ? newItem.getField("title") : Zotero.getString(error)));
+	// update icon
+	this._progressWindow.document.getElementById("item-"+this._item.id+"-icon").
+		setAttribute("src", (newItem ? Zotero_RecognizePDF_SUCCESS_IMAGE : Zotero_RecognizePDF_FAILURE_IMAGE));
 	
-	/**
-	 * Retreives metadata for the PDF items passed, placing the PDFs as a children of the new items
-	 */
-	this.recognizeItems = function(/**Zotero.Item[]*/ items) {
-		var itemsCopy = items.slice();
-		var item = itemsCopy.shift();
-		var file = item.getFile();
-		if(file) {
-			var recognizer = new Zotero_RecognizePDF.Recognizer();
-			recognizer.recognize(file, item.getField("title"),
-				function(translate, newItem) {
-					// put new item in same collections as the old one
-					var itemCollections = item.getCollections();
-					for(var j=0; j<itemCollections.length; j++) {
-						var collection = Zotero.Collections.get(itemCollections[j]);
-						collection.addItem(newItem.id);
-					}
-					
-					// put old item as a child of the new item
-					item.setSource(newItem.id);
-					item.save();
-					
-					// continue recognizing
-					if(itemsCopy.length) Zotero_RecognizePDF.recognizeItems(itemsCopy);
-				});
-		} else {
-			if(itemsCopy.length) Zotero_RecognizePDF.recognizeItems(itemsCopy);
+	if(error == "recognizePDF.limit") {
+		// now done, since we hit the query limit; mark every item still queued as failed
+		var limitMessage = Zotero.getString(error);
+		for(var i=0; i<this._items.length; i++) {
+			this._progressWindow.document.getElementById("item-"+this._items[i].id+"-title").
+				setAttribute("label", limitMessage);
+			this._progressWindow.document.getElementById("item-"+this._items[i].id+"-icon").
+				setAttribute("src", Zotero_RecognizePDF_FAILURE_IMAGE);
 		}
+		this._done();
+	} else {
+		// scroll to this item
+		this._progressWindow.document.getElementById("tree").treeBoxObject.scrollToRow(Math.max(0, this._itemTotal-this._items.length-5));
+		// continue recognizing
+		this._recognizeItem();
 	}
 }
 
+/*Zotero_RecognizePDF.ItemRecognizer.prototype._captchaCallback = function(img) {
+	var io = {dataIn:img};
+	Zotero.debug(img);
+	this._progressWindow.openDialog("chrome://zotero/content/pdfCaptcha.xul", "", "chrome,modal,resizable=no", io);
+	
+	if(io.dataOut) return io.dataOut;
+	
+	this.stop();
+	this._progressWindow.close();
+	return false;
+}*/
+
 /**
  * @class PDF recognizer backend
  */
@@ -89,19 +230,23 @@ Zotero_RecognizePDF.Recognizer = function () {}
  * Retrieves metadata for a PDF and saves it as an item
  *
  * @param {nsIFile} file The PDF file to retrieve metadata for
- * @param {String} pdfTitle The title of the PDF
  * @param {Function} callback The function to be executed when recognition is complete
+ * @param {Function} [captchaCallback] The function to be executed if a CAPTCHA is encountered
+ *	(function will be passed image as URL and must return text of CAPTCHA)
  */
-Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, callback) {
+Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, callback, captchaCallback) {
 	const MAX_PAGES = 2;
 	
-	this._pdfTitle = pdfTitle;
-	this._callback = callback;
+	const lineRe = /^\s*([^\s]+(?: [^\s]+)+)/;
 	
-	const whitespaceRe = /^\s*$/;
+	this._callback = callback;
+	//this._captchaCallback = captchaCallback;
 	
 	var cacheFile = Zotero.getZoteroDirectory();
-	cacheFile.append("recognizePDFcache.txt");
+	cacheFile.append(".zotero-recpdf-cache");
+	if(cacheFile.exists()) {
+		cacheFile.remove(false);
+	}
 	
 	Zotero.debug('Running pdftotext -enc UTF-8 -nopgbrk '
 				+ '-l ' + MAX_PAGES + ' "' + file.path + '" "'
@@ -113,10 +258,15 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca
 	exec.append(Zotero.Fulltext.pdfConverterFileName);
 	proc.init(exec);
 	
-	var args = ['-enc', 'UTF-8', '-nopgbrk', '-raw', '-l', MAX_PAGES];
+	var args = ['-enc', 'UTF-8', '-nopgbrk', '-layout', '-l', MAX_PAGES];
 	args.push(file.path, cacheFile.path);
 	proc.run(true, args, args.length);
 	
+	if(!cacheFile.exists()) {
+		this._callback(false, "recognizePDF.couldNotRead");
+		return;
+	}
+	
 	var inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
 		.createInstance(Components.interfaces.nsIFileInputStream);
 	inputStream.init(cacheFile, 0x01, 0664, 0);
@@ -131,31 +281,34 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca
 	var lineLengths = [];
 	var str = {};
 	while(intlStream.readLine(str)) {
-		if(!whitespaceRe.test(str.value)) {
-			lines.push(str.value);
-			lineLengths.push(str.value.length);
+		var line = lineRe.exec(str.value);
+		if(line) {
+			lines.push(line[1]);
+			lineLengths.push(line[1].length);
 		}
 	}
 	
 	// get (not quite) median length
 	var lineLengthsLength = lineLengths.length;
 	if(lineLengthsLength < 20) {
-		this._error();
+		this._callback(false, "recognizePDF.noOCR");
 		return;
 	}
 	
 	var sortedLengths = lineLengths.sort();
 	var medianLength = sortedLengths[Math.floor(lineLengthsLength/2)];
 	
-	// pick lines within 4 chars of the median
+	// pick lines within 4 chars of the median (this is completely arbitrary)
 	this._goodLines = [];
 	var uBound = medianLength + 4;
 	var lBound = medianLength - 4;
 	for (var i=0; i<lineLengthsLength; i++) {
-		if(lineLengths[i] >= lBound && lineLengths[i] <= uBound) this._goodLines.push(lines[i]);
+		if(lineLengths[i] > lBound && lineLengths[i] < uBound) this._goodLines.push(lines[i]);
 	}
 	
 	this._startLine = this._iteration = 0;
+	
+	cacheFile.remove(false);
 	this._queryGoogle();
 }
 
@@ -165,18 +318,32 @@ Zotero_RecognizePDF.Recognizer.prototype.recognize = function(file, pdfTitle, ca
  */
 Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
 	if(this._iteration > 3 || this._startLine >= this._goodLines.length) {
-		this._error();
+		try {
+			if(this._hiddenBrowser) Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
+		} catch(e) {}
+		this._callback(false, "recognizePDF.noMatches");
 		return;
 	}
+	this._iteration++;
 	
 	// take the relevant parts of some lines (exclude hyphenated word)
 	var queryStringWords = 0;
 	var queryString = "";
 	while(queryStringWords < 25 && this._startLine < this._goodLines.length) {
 		var words = this._goodLines[this._startLine].split(/\s+/);
+		// get rid of first and last words
 		words.shift();
 		words.pop();
-		if(words.length) {
+		// make sure there are no long words (probably OCR mistakes)
+		var skipLine = false;
+		for(var i=0; i<words.length; i++) {
+			if(words[i].length > 20) {
+				skipLine = true;
+				break;
+			}
+		}
+		// add words to query
+		if(!skipLine && words.length) {
 			queryStringWords += words.length;
 			queryString += '"'+words.join(" ")+'" ';
 		}
@@ -185,27 +352,61 @@ Zotero_RecognizePDF.Recognizer.prototype._queryGoogle = function() {
 	Zotero.debug("RecognizePDF: Query string "+queryString);
 	
 	// pass query string to Google Scholar and translate
-	var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString);
-	this.hiddenBrowser = Zotero.Browser.createHiddenBrowser();
+	var url = "http://scholar.google.com/scholar?q="+encodeURIComponent(queryString)+"&hl=en&lr=&btnG=Search";
+	if(!this._hiddenBrowser) {
+		this._hiddenBrowser = Zotero.Browser.createHiddenBrowser();
+		this._hiddenBrowser.docShell.allowImages = false;
+	}
 	
 	var me = this;
 	var translate = new Zotero.Translate("web", true, false);
 	translate.setTranslator("57a00950-f0d1-4b41-b6ba-44ff0fc30289");
-	translate.setHandler("itemDone", this._callback);
+	translate.setHandler("itemDone", function(translate, item) {
+		Zotero.Browser.deleteHiddenBrowser(me._hiddenBrowser);
+		me._callback(item);
+	});
 	translate.setHandler("select", function(translate, items) { return me._selectItems(translate, items) });
-	translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle() });
+	translate.setHandler("done", function(translate, success) { if(!success) me._queryGoogle(); });
+	
+	this._hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
+	
+	// to make us a little less obvious, specify a referrer
+	var referrer = Components.classes["@mozilla.org/network/io-service;1"]
+		.getService(Components.interfaces.nsIIOService)
+		.newURI(this._previousURL ? this._previousURL : "http://scholar.google.com/", null, null);
+	this._hiddenBrowser.loadURIWithFlags(url,
+		Components.interfaces.nsIWebNavigation.LOAD_FLAGS_BYPASS_HISTORY, referrer, null, null);
 	
-	this.hiddenBrowser.addEventListener("pageshow", function() { me._scrape(translate) }, true);
-	this.hiddenBrowser.loadURI(url);
+	this._previousURL = url;
 }
 
 /**
- * Callback to be executed when Google Scholar is loaded
+ * To be executed when Google Scholar is loaded
  * @private
  */
 Zotero_RecognizePDF.Recognizer.prototype._scrape = function(/**Zotero.Translate*/ translate) {
-	this.hiddenBrowser.removeEventListener("pageshow", this._scrape.caller, true);
-	translate.setDocument(this.hiddenBrowser.contentDocument);
+	// detach the handler that invoked us (the closure registered in _queryGoogle) before
+	// anything else, so that it is also removed when we bail out below
+	this._hiddenBrowser.removeEventListener("pageshow", this._scrape.caller, true);
+	
+	if(this._hiddenBrowser.contentDocument.title == "403 Forbidden") {
+		// hit the captcha
+		/*
+		var forms = this._hiddenBrowser.contentDocument.getElementsByTagName("form");
+		if(forms.length && forms[0].getAttribute("action") == "Captcha") {
+			var captchaImage = forms[0].getElementsByTagName("img");
+			var captchaBox = this._hiddenBrowser.contentDocument.getElementsByName("captcha");
+			if(captchaImage.length && captchaBox.length && this._captchaCallback) {
+				var text = this._captchaCallback(captchaImage[0].src);
+				if(text) {
+					captchaBox[0].value = text;
+					forms[0].submit();
+					return;
+				}
+			}
+		}*/
+		// no further queries are made for this PDF, so release the hidden browser here, as
+		// the itemDone handler in _queryGoogle does on success
+		Zotero.Browser.deleteHiddenBrowser(this._hiddenBrowser);
+		this._callback(false, "recognizePDF.limit");
+		return;
+	}
+	
+	translate.setDocument(this._hiddenBrowser.contentDocument);
 	translate.translate();
 }
 
@@ -220,16 +421,4 @@ Zotero_RecognizePDF.Recognizer.prototype._selectItems = function(/**Zotero.Trans
 		obj[i] = items;
 		return obj;
 	}
-}
-
-/**
- * Displays an error when a PDF cannot be recognized
- * @private
- */
-Zotero_RecognizePDF.Recognizer.prototype._error = function() {
-	var promptService = Components.classes["@mozilla.org/embedcomp/prompt-service;1"]
-									.getService(Components.interfaces.nsIPromptService);
-	promptService.alert(window,
-		Zotero.getString('recognizePDF.couldNotRecognize.title'),
-		Zotero.getString('recognizePDF.couldNotRecognize.message', this._pdfTitle));
 }
 \ No newline at end of file
diff --git a/chrome/locale/en-US/zotero/zotero.dtd b/chrome/locale/en-US/zotero/zotero.dtd
@@ -164,3 +164,9 @@
 <!ENTITY zotero.proxy.recognized.warning.secondary		"Adding other proxies allows malicious sites to masquerade as sites you trust."> 
 <!ENTITY zotero.proxy.recognized.disable.label			"Do not automatically redirect requests through previously recognized proxies">
 <!ENTITY zotero.proxy.recognized.ignore.label			"Ignore">
+
+<!ENTITY zotero.recognizePDF.recognizing.label			"Retrieving Metadata...">
+<!ENTITY zotero.recognizePDF.cancel.label					"Cancel">
+<!ENTITY zotero.recognizePDF.pdfName.label				"PDF Name">
+<!ENTITY zotero.recognizePDF.itemName.label				"Item Name">
+<!ENTITY zotero.recognizePDF.captcha.label				"Type the text below to continue retrieving metadata.">
+\ No newline at end of file
diff --git a/chrome/locale/en-US/zotero/zotero.properties b/chrome/locale/en-US/zotero/zotero.properties
@@ -516,5 +516,10 @@ proxies.recognized.add				= Add Proxy
 proxies.enableTransparentWarning.title			= Warning
 proxies.enableTransparentWarning.description	= Please ensure that the proxies listed below belong to a library, school, or other institution with which you are affiliated. A malicious proxy could pose a security risk.
 
-recognizePDF.couldNotRecognize.title	= Could Not Retrieve Metada
-recognizePDF.couldNotRecognize.message	= Zotero could not retrieve metadata for "%1$S".
+recognizePDF.noOCR					= PDF does not contain OCRed text.
+recognizePDF.couldNotRead			= Could not read text from PDF.
+recognizePDF.noMatches				= No matching references found.
+recognizePDF.fileNotFound			= File not found.
+recognizePDF.limit					= Query limit reached. Try again later.
+recognizePDF.complete.label			= Metadata Retrieval Complete.
+recognizePDF.close.label			= Close
+\ No newline at end of file
diff --git a/chrome/skin/default/zotero/cross.png b/chrome/skin/default/zotero/cross.png
Binary files differ.
diff --git a/chrome/skin/default/zotero/indicator.gif b/chrome/skin/default/zotero/indicator.gif
Binary files differ.

	www Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| Submodules \| README \| LICENSE

A	chrome/content/zotero/pdfProgress.xul	\|	23	+++++++++++++++++++++++
M	chrome/content/zotero/recognizePDF.js	\|	323	++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
M	chrome/locale/en-US/zotero/zotero.dtd	\|	7	+++++++
M	chrome/locale/en-US/zotero/zotero.properties	\|	10	++++++++--
A	chrome/skin/default/zotero/cross.png	\|	0
A	chrome/skin/default/zotero/indicator.gif	\|	0