www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

recognizePDF.js (17070B)


      1 /*
      2     ***** BEGIN LICENSE BLOCK *****
      3     
      4     Copyright © 2018 Center for History and New Media
      5                      George Mason University, Fairfax, Virginia, USA
      6                      http://zotero.org
      7     
      8     This file is part of Zotero.
      9     
     10     Zotero is free software: you can redistribute it and/or modify
     11     it under the terms of the GNU Affero General Public License as published by
     12     the Free Software Foundation, either version 3 of the License, or
     13     (at your option) any later version.
     14     
     15     Zotero is distributed in the hope that it will be useful,
     16     but WITHOUT ANY WARRANTY; without even the implied warranty of
     17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     18     GNU Affero General Public License for more details.
     19     
     20     You should have received a copy of the GNU Affero General Public License
     21     along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
     22     
     23     ***** END LICENSE BLOCK *****
     24 */
     25 
     26 Zotero.RecognizePDF = new function () {
     27 	const OFFLINE_RECHECK_DELAY = 60 * 1000;
     28 	const MAX_PAGES = 5;
     29 	const UNRECOGNIZE_TIMEOUT = 86400 * 1000;
     30 	
     31 	this.ROW_QUEUED = 1;
     32 	this.ROW_PROCESSING = 2;
     33 	this.ROW_FAILED = 3;
     34 	this.ROW_SUCCEEDED = 4;
     35 	
     36 	let _newItems = new WeakMap();
     37 	
     38 	let _listeners = {};
     39 	let _rows = [];
     40 	let _queue = [];
     41 	let _queueProcessing = false;
     42 	
     43 	/**
     44 	 * Add listener
     45 	 * @param name Event name
     46 	 * @param callback
     47 	 */
     48 	this.addListener = function (name, callback) {
     49 		_listeners[name] = callback;
     50 	};
     51 	
     52 	/**
     53 	 * Remove listener
     54 	 * @param name Event name
     55 	 */
     56 	this.removeListener = function (name) {
     57 		delete _listeners[name];
     58 	};
     59 	
     60 	/**
     61 	 * Checks whether a given PDF could theoretically be recognized
     62 	 * @param {Zotero.Item} item
     63 	 * @return {Boolean} True if the PDF can be recognized, false if it cannot be
     64 	 */
     65 	this.canRecognize = function (item) {
     66 		return item.attachmentContentType
     67 			&& item.attachmentContentType === 'application/pdf'
     68 			&& item.isTopLevelItem();
     69 	};
     70 	
     71 	/**
     72 	 * Adds items to the queue and starts processing it
     73 	 * @param items {Zotero.Item}
     74 	 */
     75 	this.recognizeItems = function (items) {
     76 		for (let item of items) {
     77 			_addItem(item);
     78 		}
     79 		_processQueue();
     80 	};
     81 	
     82 	
     83 	this.autoRecognizeItems = function (items) {
     84 		if (!Zotero.Prefs.get('autoRecognizeFiles')) return;
     85 		
     86 		var pdfs = items.filter((item) => {
     87 			return item
     88 				&& item.isFileAttachment()
     89 				&& item.attachmentContentType == 'application/pdf';
     90 		});
     91 		if (!pdfs.length) {
     92 			return;
     93 		}
     94 		this.recognizeItems(pdfs);
     95 		var win = Services.wm.getMostRecentWindow("navigator:browser");
     96 		if (win) {
     97 			win.Zotero_RecognizePDF_Dialog.open();
     98 		}
     99 	};
    100 	
    101 	/**
    102 	 * Returns all rows
    103 	 * @return {Array}
    104 	 */
    105 	this.getRows = function () {
    106 		return _rows;
    107 	};
    108 	
    109 	/**
    110 	 * Returns rows count
    111 	 * @return {Number}
    112 	 */
    113 	this.getTotal = function () {
    114 		return _rows.length;
    115 	};
    116 	
    117 	/**
    118 	 * Returns processed rows count
    119 	 * @return {Number}
    120 	 */
    121 	this.getProcessedTotal = function () {
    122 		return _rows.filter(row => row.status > Zotero.RecognizePDF.ROW_PROCESSING).length;
    123 	};
    124 	
    125 	/**
    126 	 * Stop processing items
    127 	 */
    128 	this.cancel = function () {
    129 		_queue = [];
    130 		_rows = [];
    131 		if (_listeners['empty']) {
    132 			_listeners['empty']();
    133 		}
    134 	};
    135 	
    136 	
    137 	this.canUnrecognize = function (item) {
    138 		var { dateModified } = _newItems.get(item) || {};
    139 		// Item must have been recognized recently, must not have been modified since it was
    140 		// created, and must have only one attachment and no other children
    141 		if (!dateModified
    142 				|| Zotero.Date.sqlToDate(dateModified, true) < new Date() - UNRECOGNIZE_TIMEOUT
    143 				|| item.dateModified != dateModified
    144 				|| item.numAttachments(true) != 1
    145 				|| item.numChildren(true) != 1) {
    146 			_newItems.delete(item);
    147 			return false;
    148 		}
    149 		
    150 		// Child attachment must be not be in trash and must be a PDF
    151 		var attachments = Zotero.Items.get(item.getAttachments());
    152 		if (!attachments.length || attachments[0].attachmentContentType != 'application/pdf') {
    153 			_newItems.delete(item);
    154 			return false;
    155 		}
    156 		
    157 		return true;
    158 	};
    159 	
    160 	
    161 	this.unrecognize = async function (item) {
    162 		var { originalTitle, originalFilename } = _newItems.get(item);
    163 		var attachment = Zotero.Items.get(item.getAttachments()[0]);
    164 		
    165 		try {
    166 			let currentFilename = attachment.attachmentFilename;
    167 			if (currentFilename != originalFilename) {
    168 				let renamed = await attachment.renameAttachmentFile(originalFilename);
    169 				if (renamed) {
    170 					attachment.setField('title', originalTitle);
    171 				}
    172 			}
    173 		}
    174 		catch (e) {
    175 			Zotero.logError(e);
    176 		}
    177 		
    178 		return Zotero.DB.executeTransaction(async function () {
    179 			let collections = item.getCollections();
    180 			attachment.parentItemID = null
    181 			attachment.setCollections(collections);
    182 			await attachment.save();
    183 			
    184 			await item.erase();
    185 		}.bind(this));
    186 	};
    187 	
    188 	
    189 	this.report = async function (item, description) {
    190 		var attachment = Zotero.Items.get(item.getAttachments()[0]);
    191 		var filePath = attachment.getFilePath();
    192 		if (!filePath || !await OS.File.exists(filePath)) {
    193 			throw new Error("File not found when reporting metadata");
    194 		}
    195 		
    196 		var version = Zotero.version;
    197 		var json = await extractJSON(filePath, MAX_PAGES);
    198 		var metadata = item.toJSON();
    199 		
    200 		var data = { description, version, json, metadata };
    201 		var uri = ZOTERO_CONFIG.RECOGNIZE_URL + 'report';
    202 		return Zotero.HTTP.request(
    203 			"POST",
    204 			uri,
    205 			{
    206 				successCodes: [200, 204],
    207 				headers: {
    208 					'Content-Type': 'application/json'
    209 				},
    210 				body: JSON.stringify(data)
    211 			}
    212 		);
    213 	};
    214 	
    215 	
    216 	/**
    217 	 * Add item for processing
    218 	 * @param item
    219 	 * @return {null}
    220 	 */
    221 	function _addItem(item) {
    222 		for (let row of _rows) {
    223 			if (row.id === item.id) {
    224 				if (row.status > Zotero.RecognizePDF.ROW_PROCESSING) {
    225 					_deleteRow(row.id);
    226 					break;
    227 				}
    228 				return null;
    229 			}
    230 		}
    231 		
    232 		let row = {
    233 			id: item.id,
    234 			status: Zotero.RecognizePDF.ROW_QUEUED,
    235 			fileName: item.getField('title'),
    236 			message: ''
    237 		};
    238 		
    239 		_rows.unshift(row);
    240 		_queue.unshift(item.id);
    241 		
    242 		if (_listeners['rowadded']) {
    243 			_listeners['rowadded'](row);
    244 		}
    245 		
    246 		if (_listeners['nonempty'] && _rows.length === 1) {
    247 			_listeners['nonempty']();
    248 		}
    249 	}
    250 	
    251 	/**
    252 	 * Update row status and message
    253 	 * @param itemID
    254 	 * @param status
    255 	 * @param message
    256 	 */
    257 	function _updateRow(itemID, status, message) {
    258 		for (let row of _rows) {
    259 			if (row.id === itemID) {
    260 				row.status = status;
    261 				row.message = message;
    262 				if (_listeners['rowupdated']) {
    263 					_listeners['rowupdated']({
    264 						id: row.id,
    265 						status,
    266 						message: message || ''
    267 					});
    268 				}
    269 				return;
    270 			}
    271 		}
    272 	}
    273 	
    274 	/**
    275 	 * Delete row
    276 	 * @param itemID
    277 	 */
    278 	function _deleteRow(itemID) {
    279 		for (let i = 0; i < _rows.length; i++) {
    280 			let row = _rows[i];
    281 			if (row.id === itemID) {
    282 				_rows.splice(i, 1);
    283 				if (_listeners['rowdeleted']) {
    284 					_listeners['rowdeleted']({
    285 						id: row.id
    286 					});
    287 				}
    288 				return;
    289 			}
    290 		}
    291 	}
    292 	
    293 	/**
    294 	 * Triggers queue processing and returns when all items in the queue are processed
    295 	 * @return {Promise}
    296 	 */
    297 	async function _processQueue() {
    298 		await Zotero.Schema.schemaUpdatePromise;
    299 		
    300 		if (_queueProcessing) return;
    301 		_queueProcessing = true;
    302 		
    303 		while (1) {
    304 			if (Zotero.HTTP.browserIsOffline()) {
    305 				await Zotero.Promise.delay(OFFLINE_RECHECK_DELAY);
    306 				continue;
    307 			}
    308 			
    309 			let itemID = _queue.shift();
    310 			if (!itemID) break;
    311 			
    312 			_updateRow(itemID, Zotero.RecognizePDF.ROW_PROCESSING, Zotero.getString('general.processing'));
    313 			
    314 			try {
    315 				let newItem = await _processItem(itemID);
    316 				
    317 				if (newItem) {
    318 					_updateRow(itemID, Zotero.RecognizePDF.ROW_SUCCEEDED, newItem.getField('title'));
    319 				}
    320 				else {
    321 					_updateRow(itemID, Zotero.RecognizePDF.ROW_FAILED, Zotero.getString('recognizePDF.noMatches'));
    322 				}
    323 			}
    324 			catch (e) {
    325 				Zotero.logError(e);
    326 				
    327 				_updateRow(
    328 					itemID,
    329 					Zotero.RecognizePDF.ROW_FAILED,
    330 					e instanceof Zotero.Exception.Alert
    331 						? e.message
    332 						: Zotero.getString('recognizePDF.error')
    333 				);
    334 			}
    335 		}
    336 		
    337 		_queueProcessing = false;
    338 	}
    339 	
    340 	/**
    341 	 * Processes the item and places it as a children of the new item
    342 	 * @param itemID
    343 	 * @return {Promise}
    344 	 */
    345 	async function _processItem(itemID) {
    346 		let attachment = await Zotero.Items.getAsync(itemID);
    347 		
    348 		if (!attachment || attachment.parentItemID) {
    349 			throw new Zotero.Exception.Alert('recognizePDF.error');
    350 		}
    351 		
    352 		var zp = Zotero.getActiveZoteroPane();
    353 		var selectParent = false;
    354 		if (zp) {
    355 			let selected = zp.getSelectedItems();
    356 			if (selected.length) {
    357 				// If only the PDF was selected, select the parent when we're done
    358 				selectParent = selected.length == 1 && selected[0] == attachment;
    359 			}
    360 		}
    361 		
    362 		let parentItem = await _recognize(attachment);
    363 		if (!parentItem) {
    364 			return null;
    365 		}
    366 		
    367 		// Put new item in same collections as the old one
    368 		let collections = attachment.getCollections();
    369 		await Zotero.DB.executeTransaction(async function () {
    370 			if (collections.length) {
    371 				for (let collectionID of collections) {
    372 					parentItem.addToCollection(collectionID);
    373 				}
    374 				await parentItem.save();
    375 			}
    376 			
    377 			// Put old item as a child of the new item
    378 			attachment.parentID = parentItem.id;
    379 			await attachment.save();
    380 		});
    381 		
    382 		var originalTitle = attachment.getField('title');
    383 		var path = attachment.getFilePath();
    384 		var originalFilename = OS.Path.basename(path);
    385 		
    386 		// Rename attachment file to match new metadata
    387 		if (Zotero.Prefs.get('autoRenameFiles')) {
    388 			let ext = Zotero.File.getExtension(path);
    389 			let fileBaseName = Zotero.Attachments.getFileBaseNameFromItem(parentItem);
    390 			let newName = fileBaseName + (ext ? '.' + ext : '');
    391 			let result = await attachment.renameAttachmentFile(newName, false, true);
    392 			if (result !== true) {
    393 				throw new Error("Error renaming " + path);
    394 			}
    395 			// Rename attachment title
    396 			attachment.setField('title', newName);
    397 			await attachment.saveTx();
    398 		}
    399 		
    400 		try {
    401 			zp = Zotero.getActiveZoteroPane();
    402 			if (zp) {
    403 				if (selectParent) {
    404 					await zp.selectItem(parentItem.id);
    405 				}
    406 			}
    407 		}
    408 		catch (e) {
    409 			Zotero.logError(e);
    410 		}
    411 		
    412 		_newItems.set(
    413 			parentItem,
    414 			{
    415 				originalTitle,
    416 				originalFilename,
    417 				dateModified: parentItem.dateModified
    418 			}
    419 		);
    420 		return parentItem;
    421 	}
    422 	
    423 	/**
    424 	 * Get json from a PDF
    425 	 * @param {String} filePath PDF file path
    426 	 * @param {Number} pages Number of pages to extract
    427 	 * @return {Promise}
    428 	 */
    429 	async function extractJSON(filePath, pages) {
    430 		let cacheFile = Zotero.getTempDirectory();
    431 		cacheFile.append("recognizePDFcache.txt");
    432 		if (cacheFile.exists()) {
    433 			cacheFile.remove(false);
    434 		}
    435 		
    436 		let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
    437 		args.push('-json', '-l', pages, filePath, cacheFile.path);
    438 		
    439 		Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
    440 		
    441 		try {
    442 			await Zotero.Utilities.Internal.exec(exec, args);
    443 			let content = await Zotero.File.getContentsAsync(cacheFile.path);
    444 			Zotero.debug("RecognizePDF: Extracted JSON:");
    445 			Zotero.debug(content);
    446 			cacheFile.remove(false);
    447 			return JSON.parse(content);
    448 		}
    449 		catch (e) {
    450 			Zotero.logError(e);
    451 			try {
    452 				cacheFile.remove(false);
    453 			} catch(e) {
    454 				Zotero.logError(e);
    455 			}
    456 			throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
    457 		}
    458 	}
    459 	
    460 	/**
    461 	 * Attach appropriate handlers to a Zotero.Translate instance and begin translation
    462 	 * @return {Promise}
    463 	 */
    464 	async function _promiseTranslate(translate, libraryID) {
    465 		translate.setHandler('select', function (translate, items, callback) {
    466 			for (let i in items) {
    467 				let obj = {};
    468 				obj[i] = items[i];
    469 				callback(obj);
    470 				return;
    471 			}
    472 		});
    473 		
    474 		let newItems = await translate.translate({
    475 			libraryID,
    476 			saveAttachments: false
    477 		});
    478 		if (newItems.length) {
    479 			return newItems[0];
    480 		}
    481 		throw new Error('No items found');
    482 	}
    483 	
    484 	async function _query(json) {
    485 		// TODO: Use main API URL for recognizer server
    486 		//let uri = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.API_URL;
    487 		let uri = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.RECOGNIZE_URL;
    488 		
    489 		if (!uri.endsWith('/')) {
    490 			uri += '/';
    491 		}
    492 		
    493 		uri += 'recognize';
    494 		
    495 		let client = Zotero.Sync.Runner.getAPIClient();
    496 		
    497 		let req = await client.makeRequest(
    498 			'POST',
    499 			uri,
    500 			{
    501 				successCodes: [200],
    502 				headers: {
    503 					'Content-Type': 'application/json'
    504 				},
    505 				body: JSON.stringify(json),
    506 				noAPIKey: true
    507 			}
    508 		);
    509 		
    510 		return JSON.parse(req.responseText);
    511 	}
    512 	
    513 	/**
    514 	 * Retrieves metadata for a PDF and saves it as an item
    515 	 * @param {Zotero.Item} item
    516 	 * @return {Promise}
    517 	 */
    518 	async function _recognize(item) {
    519 		let filePath = await item.getFilePath();
    520 		
    521 		if (!filePath || !await OS.File.exists(filePath)) throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
    522 
    523 		let json = await extractJSON(filePath, MAX_PAGES);
    524 		
    525 		let containingTextPages = 0;
    526 		
    527 		for(let page of json.pages) {
    528 			if(page[2].length) {
    529 				containingTextPages++;
    530 			}
    531 		}
    532 		
    533 		if(!containingTextPages) {
    534 			throw new Zotero.Exception.Alert('recognizePDF.noOCR');
    535 		}
    536 		
    537 		let libraryID = item.libraryID;
    538 		
    539 		let res = await _query(json);
    540 		if (!res) return null;
    541 		
    542 		if (res.arxiv) {
    543 			Zotero.debug('RecognizePDF: Getting metadata by arXiv');
    544 			let translate = new Zotero.Translate.Search();
    545 			translate.setIdentifier({arXiv: res.arxiv});
    546 			let translators = await translate.getTranslators();
    547 			translate.setTranslator(translators);
    548 			
    549 			try {
    550 				let newItem = await _promiseTranslate(translate, libraryID);
    551 				if (!newItem.abstractNote && res.abstract) {
    552 					newItem.setField('abstractNote', res.abstract);
    553 				}
    554 				if (!newItem.language && res.language) {
    555 					newItem.setField('language', res.language);
    556 				}
    557 				newItem.saveTx();
    558 				return newItem;
    559 			}
    560 			catch (e) {
    561 				Zotero.debug('RecognizePDF: ' + e);
    562 			}
    563 		}
    564 		
    565 		if (res.doi) {
    566 			Zotero.debug('RecognizePDF: Getting metadata by DOI');
    567 			let translate = new Zotero.Translate.Search();
    568 			translate.setIdentifier({
    569 				DOI: res.doi
    570 			});
    571 			let translators = await translate.getTranslators();
    572 			translate.setTranslator(translators);
    573 			
    574 			try {
    575 				let newItem = await _promiseTranslate(translate, libraryID);
    576 				if (!newItem.abstractNote && res.abstract) {
    577 					newItem.setField('abstractNote', res.abstract);
    578 				}
    579 				if (!newItem.language && res.language) {
    580 					newItem.setField('language', res.language);
    581 				}
    582 				newItem.saveTx();
    583 				return newItem;
    584 			}
    585 			catch (e) {
    586 				Zotero.debug('RecognizePDF: ' + e);
    587 			}
    588 		}
    589 		
    590 		if (res.isbn) {
    591 			Zotero.debug('RecognizePDF: Getting metadata by ISBN');
    592 			let translate = new Zotero.Translate.Search();
    593 			translate.setSearch({'itemType': 'book', 'ISBN': res.isbn});
    594 			try {
    595 				let translatedItems = await translate.translate({
    596 					libraryID: false,
    597 					saveAttachments: false
    598 				});
    599 				Zotero.debug('RecognizePDF: Translated items:');
    600 				Zotero.debug(translatedItems);
    601 				if (translatedItems.length) {
    602 					let newItem = new Zotero.Item;
    603 					newItem.libraryID = libraryID;
    604 					// Convert tags to automatic. For other items this is done automatically in
    605 					// translate.js for other items, but for ISBNs we just get the data
    606 					// (libraryID=false) and do the saving manually.
    607 					translatedItems[0].tags = translatedItems[0].tags.map(tag => {
    608 						if (typeof tag == 'string') {
    609 							return {
    610 								tag,
    611 								type: 1
    612 							};
    613 						}
    614 						tag.type = 1;
    615 						return tag;
    616 					});
    617 					newItem.fromJSON(translatedItems[0]);
    618 					if (!newItem.abstractNote && res.abstract) {
    619 						newItem.setField('abstractNote', res.abstract);
    620 					}
    621 					if (!newItem.language && res.language) {
    622 						newItem.setField('language', res.language);
    623 					}
    624 					newItem.saveTx();
    625 					return newItem;
    626 				}
    627 			}
    628 			catch (e) {
    629 				Zotero.debug('RecognizePDF: ' + e);
    630 			}
    631 		}
    632 		
    633 		if (res.title) {
    634 			let type = 'journalArticle';
    635 			
    636 			if (res.type === 'book-chapter') {
    637 				type = 'bookSection';
    638 			}
    639 			
    640 			let newItem = new Zotero.Item(type);
    641 			newItem.libraryID = libraryID;
    642 			newItem.setField('title', res.title);
    643 			
    644 			let creators = [];
    645 			for (let author of res.authors) {
    646 				creators.push({
    647 					firstName: author.firstName,
    648 					lastName: author.lastName,
    649 					creatorType: 'author'
    650 				})
    651 			}
    652 			
    653 			newItem.setCreators(creators);
    654 			
    655 			if (res.abstract) newItem.setField('abstractNote', res.abstract);
    656 			if (res.year) newItem.setField('date', res.year);
    657 			if (res.pages) newItem.setField('pages', res.pages);
    658 			if (res.volume) newItem.setField('volume', res.volume);
    659 			if (res.url) newItem.setField('url', res.url);
    660 			if (res.language) newItem.setField('language', res.language);
    661 			
    662 			if (type === 'journalArticle') {
    663 				if (res.issue) newItem.setField('issue', res.issue);
    664 				if (res.ISSN) newItem.setField('issn', res.issn);
    665 				if (res.container) newItem.setField('publicationTitle', res.container);
    666 			}
    667 			else if (type === 'bookSection') {
    668 				if (res.container) newItem.setField('bookTitle', res.container);
    669 				if (res.publisher) newItem.setField('publisher', res.publisher);
    670 			}
    671 			
    672 			newItem.setField('libraryCatalog', 'Zotero');
    673 			
    674 			await newItem.saveTx();
    675 			return newItem;
    676 		}
    677 		
    678 		return null;
    679 	}
    680 };
    681