recognizePDF.js (17070B)
/*
    ***** BEGIN LICENSE BLOCK *****
    
    Copyright © 2018 Center for History and New Media
                     George Mason University, Fairfax, Virginia, USA
                     http://zotero.org
    
    This file is part of Zotero.
    
    Zotero is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    Zotero is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.
    
    You should have received a copy of the GNU Affero General Public License
    along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
    
    ***** END LICENSE BLOCK *****
*/

/**
 * Queue-based PDF metadata recognition.
 *
 * Attachments are added to an internal queue, processed one at a time, and
 * their progress is tracked as "rows" that listeners are notified about
 * ('rowadded', 'rowupdated', 'rowdeleted', 'nonempty', 'empty').
 */
Zotero.RecognizePDF = new function () {
	// Delay before checking again whether the browser is still offline
	const OFFLINE_RECHECK_DELAY = 60 * 1000;
	// Passed to the PDF converter as the `-l` argument
	const MAX_PAGES = 5;
	// How long after recognition canUnrecognize() may still return true
	const UNRECOGNIZE_TIMEOUT = 86400 * 1000;
	
	this.ROW_QUEUED = 1;
	this.ROW_PROCESSING = 2;
	this.ROW_FAILED = 3;
	this.ROW_SUCCEEDED = 4;
	
	// Parent item -> { originalTitle, originalFilename, dateModified },
	// recorded by _processItem() and consumed by canUnrecognize()/unrecognize()
	let _newItems = new WeakMap();
	
	let _listeners = {};
	let _rows = [];
	let _queue = [];
	let _queueProcessing = false;
	
	/**
	 * Add listener (a single callback per event name; a later call replaces
	 * the earlier callback)
	 * @param name Event name
	 * @param callback
	 */
	this.addListener = function (name, callback) {
		_listeners[name] = callback;
	};
	
	/**
	 * Remove listener
	 * @param name Event name
	 */
	this.removeListener = function (name) {
		delete _listeners[name];
	};
	
	/**
	 * Checks whether a given PDF could theoretically be recognized
	 * @param {Zotero.Item} item
	 * @return {Boolean} True if the PDF can be recognized, false if it cannot be
	 */
	this.canRecognize = function (item) {
		return item.attachmentContentType
			&& item.attachmentContentType === 'application/pdf'
			&& item.isTopLevelItem();
	};
	
	/**
	 * Adds items to the queue and starts processing it
	 * @param items {Zotero.Item}
	 */
	this.recognizeItems = function (items) {
		for (let item of items) {
			_addItem(item);
		}
		_processQueue();
	};
	
	/**
	 * Queue the PDF file attachments among `items` and open the progress
	 * dialog, if the 'autoRecognizeFiles' pref is enabled
	 * @param {Zotero.Item[]} items
	 */
	this.autoRecognizeItems = function (items) {
		if (!Zotero.Prefs.get('autoRecognizeFiles')) return;
		
		var pdfs = items.filter((item) => {
			return item
				&& item.isFileAttachment()
				&& item.attachmentContentType == 'application/pdf';
		});
		if (!pdfs.length) {
			return;
		}
		this.recognizeItems(pdfs);
		var win = Services.wm.getMostRecentWindow("navigator:browser");
		if (win) {
			win.Zotero_RecognizePDF_Dialog.open();
		}
	};
	
	/**
	 * Returns all rows
	 * @return {Array}
	 */
	this.getRows = function () {
		return _rows;
	};
	
	/**
	 * Returns rows count
	 * @return {Number}
	 */
	this.getTotal = function () {
		return _rows.length;
	};
	
	/**
	 * Returns processed rows count (rows that have failed or succeeded)
	 * @return {Number}
	 */
	this.getProcessedTotal = function () {
		return _rows.filter(row => row.status > Zotero.RecognizePDF.ROW_PROCESSING).length;
	};
	
	/**
	 * Stop processing items: clears the queue and all rows and fires 'empty'
	 */
	this.cancel = function () {
		_queue = [];
		_rows = [];
		if (_listeners['empty']) {
			_listeners['empty']();
		}
	};
	
	/**
	 * Checks whether a recognized parent item can still be undone
	 * @param {Zotero.Item} item Parent item created by recognition
	 * @return {Boolean}
	 */
	this.canUnrecognize = function (item) {
		var { dateModified } = _newItems.get(item) || {};
		// Item must have been recognized recently, must not have been modified since it was
		// created, and must have only one attachment and no other children
		if (!dateModified
				|| Zotero.Date.sqlToDate(dateModified, true) < new Date() - UNRECOGNIZE_TIMEOUT
				|| item.dateModified != dateModified
				|| item.numAttachments(true) != 1
				|| item.numChildren(true) != 1) {
			_newItems.delete(item);
			return false;
		}
		
		// Child attachment must not be in trash and must be a PDF
		var attachments = Zotero.Items.get(item.getAttachments());
		if (!attachments.length || attachments[0].attachmentContentType != 'application/pdf') {
			_newItems.delete(item);
			return false;
		}
		
		return true;
	};
	
	/**
	 * Undo recognition: restore the attachment's original filename and title,
	 * make it a top-level item in the parent's collections, and erase the parent
	 *
	 * Throws if `item` has no recognition record (canUnrecognize() returns
	 * false for such items).
	 *
	 * @param {Zotero.Item} item Parent item created by recognition
	 * @return {Promise}
	 */
	this.unrecognize = async function (item) {
		var { originalTitle, originalFilename } = _newItems.get(item);
		var attachment = Zotero.Items.get(item.getAttachments()[0]);
		
		// Restoring the filename is best-effort; a failure is logged and the
		// attachment is still detached below
		try {
			let currentFilename = attachment.attachmentFilename;
			if (currentFilename != originalFilename) {
				let renamed = await attachment.renameAttachmentFile(originalFilename);
				if (renamed) {
					attachment.setField('title', originalTitle);
				}
			}
		}
		catch (e) {
			Zotero.logError(e);
		}
		
		return Zotero.DB.executeTransaction(async function () {
			let collections = item.getCollections();
			attachment.parentItemID = null;
			attachment.setCollections(collections);
			await attachment.save();
			
			await item.erase();
		});
	};
	
	/**
	 * Report incorrectly recognized metadata to the recognizer server
	 * @param {Zotero.Item} item Parent item created by recognition
	 * @param {String} description
	 * @return {Promise} Result of the HTTP request
	 */
	this.report = async function (item, description) {
		var attachment = Zotero.Items.get(item.getAttachments()[0]);
		var filePath = attachment.getFilePath();
		if (!filePath || !await OS.File.exists(filePath)) {
			throw new Error("File not found when reporting metadata");
		}
		
		var version = Zotero.version;
		var json = await extractJSON(filePath, MAX_PAGES);
		var metadata = item.toJSON();
		
		var data = { description, version, json, metadata };
		// Same trailing-slash handling as _query(), so a base URL without a
		// trailing slash doesn't produce ".../recognizereport"-style URLs
		var uri = ZOTERO_CONFIG.RECOGNIZE_URL;
		if (!uri.endsWith('/')) {
			uri += '/';
		}
		uri += 'report';
		return Zotero.HTTP.request(
			"POST",
			uri,
			{
				successCodes: [200, 204],
				headers: {
					'Content-Type': 'application/json'
				},
				body: JSON.stringify(data)
			}
		);
	};
	
	
	/**
	 * Add item for processing
	 *
	 * An item that already has a queued or in-progress row is skipped; an item
	 * with a finished row gets that row replaced.
	 *
	 * @param item
	 * @return {null}
	 */
	function _addItem(item) {
		for (let row of _rows) {
			if (row.id === item.id) {
				if (row.status > Zotero.RecognizePDF.ROW_PROCESSING) {
					_deleteRow(row.id);
					break;
				}
				return null;
			}
		}
		
		let row = {
			id: item.id,
			status: Zotero.RecognizePDF.ROW_QUEUED,
			fileName: item.getField('title'),
			message: ''
		};
		
		_rows.unshift(row);
		_queue.unshift(item.id);
		
		if (_listeners['rowadded']) {
			_listeners['rowadded'](row);
		}
		
		if (_listeners['nonempty'] && _rows.length === 1) {
			_listeners['nonempty']();
		}
	}
	
	/**
	 * Update row status and message
	 * @param itemID
	 * @param status
	 * @param message
	 */
	function _updateRow(itemID, status, message) {
		for (let row of _rows) {
			if (row.id === itemID) {
				row.status = status;
				row.message = message;
				if (_listeners['rowupdated']) {
					_listeners['rowupdated']({
						id: row.id,
						status,
						message: message || ''
					});
				}
				return;
			}
		}
	}
	
	/**
	 * Delete row
	 * @param itemID
	 */
	function _deleteRow(itemID) {
		for (let i = 0; i < _rows.length; i++) {
			let row = _rows[i];
			if (row.id === itemID) {
				_rows.splice(i, 1);
				if (_listeners['rowdeleted']) {
					_listeners['rowdeleted']({
						id: row.id
					});
				}
				return;
			}
		}
	}
	
	/**
	 * Triggers queue processing and returns when all items in the queue are processed
	 * @return {Promise}
	 */
	async function _processQueue() {
		await Zotero.Schema.schemaUpdatePromise;
		
		// Only one processing loop at a time; items queued meanwhile are picked
		// up by the loop that is already running
		if (_queueProcessing) return;
		_queueProcessing = true;
		
		try {
			while (true) {
				if (Zotero.HTTP.browserIsOffline()) {
					await Zotero.Promise.delay(OFFLINE_RECHECK_DELAY);
					continue;
				}
				
				let itemID = _queue.shift();
				if (!itemID) break;
				
				_updateRow(itemID, Zotero.RecognizePDF.ROW_PROCESSING, Zotero.getString('general.processing'));
				
				try {
					let newItem = await _processItem(itemID);
					
					if (newItem) {
						_updateRow(itemID, Zotero.RecognizePDF.ROW_SUCCEEDED, newItem.getField('title'));
					}
					else {
						_updateRow(itemID, Zotero.RecognizePDF.ROW_FAILED, Zotero.getString('recognizePDF.noMatches'));
					}
				}
				catch (e) {
					Zotero.logError(e);
					
					_updateRow(
						itemID,
						Zotero.RecognizePDF.ROW_FAILED,
						e instanceof Zotero.Exception.Alert
							? e.message
							: Zotero.getString('recognizePDF.error')
					);
				}
			}
		}
		finally {
			// Always release the flag, even if something outside the per-item
			// try/catch throws; otherwise the queue could never be processed again
			_queueProcessing = false;
		}
	}
	
	/**
	 * Processes the item and places it as a child of the new item
	 * @param itemID
	 * @return {Promise} Resolves to the new parent item, or null if nothing matched
	 */
	async function _processItem(itemID) {
		let attachment = await Zotero.Items.getAsync(itemID);
		
		// Only existing top-level attachments are processed
		if (!attachment || attachment.parentItemID) {
			throw new Zotero.Exception.Alert('recognizePDF.error');
		}
		
		var zp = Zotero.getActiveZoteroPane();
		var selectParent = false;
		if (zp) {
			let selected = zp.getSelectedItems();
			if (selected.length) {
				// If only the PDF was selected, select the parent when we're done
				selectParent = selected.length == 1 && selected[0] == attachment;
			}
		}
		
		let parentItem = await _recognize(attachment);
		if (!parentItem) {
			return null;
		}
		
		// Put new item in same collections as the old one
		let collections = attachment.getCollections();
		await Zotero.DB.executeTransaction(async function () {
			if (collections.length) {
				for (let collectionID of collections) {
					parentItem.addToCollection(collectionID);
				}
				await parentItem.save();
			}
			
			// Put old item as a child of the new item
			attachment.parentID = parentItem.id;
			await attachment.save();
		});
		
		// Remember the pre-rename title and filename so unrecognize() can restore them
		var originalTitle = attachment.getField('title');
		var path = attachment.getFilePath();
		var originalFilename = OS.Path.basename(path);
		
		// Rename attachment file to match new metadata
		if (Zotero.Prefs.get('autoRenameFiles')) {
			let ext = Zotero.File.getExtension(path);
			let fileBaseName = Zotero.Attachments.getFileBaseNameFromItem(parentItem);
			let newName = fileBaseName + (ext ? '.' + ext : '');
			let result = await attachment.renameAttachmentFile(newName, false, true);
			if (result !== true) {
				throw new Error("Error renaming " + path);
			}
			// Rename attachment title
			attachment.setField('title', newName);
			await attachment.saveTx();
		}
		
		// Selecting the new parent is cosmetic, so a failure is only logged
		try {
			zp = Zotero.getActiveZoteroPane();
			if (zp) {
				if (selectParent) {
					await zp.selectItem(parentItem.id);
				}
			}
		}
		catch (e) {
			Zotero.logError(e);
		}
		
		_newItems.set(
			parentItem,
			{
				originalTitle,
				originalFilename,
				dateModified: parentItem.dateModified
			}
		);
		return parentItem;
	}
	
	/**
	 * Get json from a PDF
	 *
	 * Runs the PDF converter into a temporary cache file, parses the file's
	 * contents as JSON, and removes the cache file afterwards.
	 *
	 * @param {String} filePath PDF file path
	 * @param {Number} pages Number of pages to extract
	 * @return {Promise}
	 * @throws {Zotero.Exception.Alert} If the converter fails or its output can't be parsed
	 */
	async function extractJSON(filePath, pages) {
		let cacheFile = Zotero.getTempDirectory();
		cacheFile.append("recognizePDFcache.txt");
		if (cacheFile.exists()) {
			cacheFile.remove(false);
		}
		
		let {exec, args} = Zotero.Fulltext.getPDFConverterExecAndArgs();
		args.push('-json', '-l', pages, filePath, cacheFile.path);
		
		Zotero.debug("RecognizePDF: Running " + exec.path + " " + args.map(arg => "'" + arg + "'").join(" "));
		
		try {
			await Zotero.Utilities.Internal.exec(exec, args);
			let content = await Zotero.File.getContentsAsync(cacheFile.path);
			Zotero.debug("RecognizePDF: Extracted JSON:");
			Zotero.debug(content);
			cacheFile.remove(false);
			return JSON.parse(content);
		}
		catch (e) {
			Zotero.logError(e);
			// The cache file may not exist if the converter failed, so removal
			// errors are logged rather than allowed to mask the alert below
			try {
				cacheFile.remove(false);
			}
			catch (removeError) {
				Zotero.logError(removeError);
			}
			throw new Zotero.Exception.Alert("recognizePDF.couldNotRead");
		}
	}
	
	/**
	 * Attach appropriate handlers to a Zotero.Translate instance and begin translation
	 * @return {Promise} Resolves to the first translated item
	 * @throws {Error} If translation returns no items
	 */
	async function _promiseTranslate(translate, libraryID) {
		// When the translator offers several candidates, pick the first one
		translate.setHandler('select', function (translate, items, callback) {
			for (let i in items) {
				let obj = {};
				obj[i] = items[i];
				callback(obj);
				return;
			}
		});
		
		let newItems = await translate.translate({
			libraryID,
			saveAttachments: false
		});
		if (newItems.length) {
			return newItems[0];
		}
		throw new Error('No items found');
	}
	
	/**
	 * Send extracted PDF data to the recognizer server
	 * @param {Object} json Output of extractJSON()
	 * @return {Promise<Object>} Parsed JSON response
	 */
	async function _query(json) {
		// TODO: Use main API URL for recognizer server
		//let uri = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.API_URL;
		let uri = Zotero.Prefs.get("api.url") || ZOTERO_CONFIG.RECOGNIZE_URL;
		
		if (!uri.endsWith('/')) {
			uri += '/';
		}
		
		uri += 'recognize';
		
		let client = Zotero.Sync.Runner.getAPIClient();
		
		let req = await client.makeRequest(
			'POST',
			uri,
			{
				successCodes: [200],
				headers: {
					'Content-Type': 'application/json'
				},
				body: JSON.stringify(json),
				noAPIKey: true
			}
		);
		
		return JSON.parse(req.responseText);
	}
	
	/**
	 * Fill in abstract and language from the recognizer response when the
	 * item doesn't have them, then save the item
	 * @param {Zotero.Item} newItem
	 * @param {Object} res Recognizer response
	 * @return {Promise}
	 */
	async function _applyFallbackFields(newItem, res) {
		// NOTE(review): these checks read plain properties rather than
		// getField(); confirm Zotero.Item exposes field values this way
		if (!newItem.abstractNote && res.abstract) {
			newItem.setField('abstractNote', res.abstract);
		}
		if (!newItem.language && res.language) {
			newItem.setField('language', res.language);
		}
		// Awaited so that the item is fully saved before callers use it and so
		// that save errors surface instead of becoming unhandled rejections
		await newItem.saveTx();
	}
	
	/**
	 * Look up metadata via a search translator for the given identifier
	 * @param {Object} identifier e.g. { DOI: ... } or { arXiv: ... }
	 * @param {Object} res Recognizer response
	 * @param libraryID
	 * @return {Promise} Resolves to the saved item, or null if the lookup failed
	 */
	async function _recognizeByIdentifier(identifier, res, libraryID) {
		let translate = new Zotero.Translate.Search();
		translate.setIdentifier(identifier);
		let translators = await translate.getTranslators();
		translate.setTranslator(translators);
		
		try {
			let newItem = await _promiseTranslate(translate, libraryID);
			await _applyFallbackFields(newItem, res);
			return newItem;
		}
		catch (e) {
			// A failed lookup isn't fatal; the caller tries the next identifier
			Zotero.debug('RecognizePDF: ' + e);
			return null;
		}
	}
	
	/**
	 * Look up book metadata by ISBN and save it as a new item
	 * @param {Object} res Recognizer response (uses `res.isbn`)
	 * @param libraryID
	 * @return {Promise} Resolves to the saved item, or null if the lookup failed
	 */
	async function _recognizeByISBN(res, libraryID) {
		let translate = new Zotero.Translate.Search();
		translate.setSearch({'itemType': 'book', 'ISBN': res.isbn});
		try {
			let translatedItems = await translate.translate({
				libraryID: false,
				saveAttachments: false
			});
			Zotero.debug('RecognizePDF: Translated items:');
			Zotero.debug(translatedItems);
			if (!translatedItems.length) {
				return null;
			}
			
			let newItem = new Zotero.Item;
			newItem.libraryID = libraryID;
			// Convert tags to automatic. For other items this is done automatically in
			// translate.js, but for ISBNs we just get the data (libraryID=false) and do
			// the saving manually.
			translatedItems[0].tags = translatedItems[0].tags.map((tag) => {
				if (typeof tag == 'string') {
					return {
						tag,
						type: 1
					};
				}
				tag.type = 1;
				return tag;
			});
			newItem.fromJSON(translatedItems[0]);
			await _applyFallbackFields(newItem, res);
			return newItem;
		}
		catch (e) {
			Zotero.debug('RecognizePDF: ' + e);
			return null;
		}
	}
	
	/**
	 * Build and save an item directly from the recognizer response
	 * @param {Object} res Recognizer response with at least `title`
	 * @param libraryID
	 * @return {Promise<Zotero.Item>}
	 */
	async function _createItemFromResponse(res, libraryID) {
		let type = 'journalArticle';
		
		if (res.type === 'book-chapter') {
			type = 'bookSection';
		}
		
		let newItem = new Zotero.Item(type);
		newItem.libraryID = libraryID;
		newItem.setField('title', res.title);
		
		// The response may carry a title without any authors
		let creators = (res.authors || []).map((author) => {
			return {
				firstName: author.firstName,
				lastName: author.lastName,
				creatorType: 'author'
			};
		});
		newItem.setCreators(creators);
		
		if (res.abstract) newItem.setField('abstractNote', res.abstract);
		if (res.year) newItem.setField('date', res.year);
		if (res.pages) newItem.setField('pages', res.pages);
		if (res.volume) newItem.setField('volume', res.volume);
		if (res.url) newItem.setField('url', res.url);
		if (res.language) newItem.setField('language', res.language);
		
		if (type === 'journalArticle') {
			if (res.issue) newItem.setField('issue', res.issue);
			// Read the value from whichever key the response uses; previously
			// `res.ISSN` was tested but `res.issn` was stored
			// NOTE(review): confirm the key casing used by the recognizer server
			let issn = res.ISSN || res.issn;
			if (issn) newItem.setField('ISSN', issn);
			if (res.container) newItem.setField('publicationTitle', res.container);
		}
		else if (type === 'bookSection') {
			if (res.container) newItem.setField('bookTitle', res.container);
			if (res.publisher) newItem.setField('publisher', res.publisher);
		}
		
		newItem.setField('libraryCatalog', 'Zotero');
		
		await newItem.saveTx();
		return newItem;
	}
	
	/**
	 * Retrieves metadata for a PDF and saves it as an item
	 *
	 * Tries, in order: arXiv ID, DOI, ISBN, and finally the bare fields
	 * (title, authors, ...) returned by the recognizer server.
	 *
	 * @param {Zotero.Item} item
	 * @return {Promise} Resolves to the new item, or null if nothing was recognized
	 * @throws {Zotero.Exception.Alert} If the file is missing or no page contains text
	 */
	async function _recognize(item) {
		let filePath = await item.getFilePath();
		
		if (!filePath || !await OS.File.exists(filePath)) {
			throw new Zotero.Exception.Alert('recognizePDF.fileNotFound');
		}
		
		let json = await extractJSON(filePath, MAX_PAGES);
		
		// A page counts as containing text when its third element is non-empty
		let containingTextPages = 0;
		for (let page of json.pages) {
			if (page[2].length) {
				containingTextPages++;
			}
		}
		
		if (!containingTextPages) {
			throw new Zotero.Exception.Alert('recognizePDF.noOCR');
		}
		
		let libraryID = item.libraryID;
		
		let res = await _query(json);
		if (!res) return null;
		
		if (res.arxiv) {
			Zotero.debug('RecognizePDF: Getting metadata by arXiv');
			let newItem = await _recognizeByIdentifier({ arXiv: res.arxiv }, res, libraryID);
			if (newItem) return newItem;
		}
		
		if (res.doi) {
			Zotero.debug('RecognizePDF: Getting metadata by DOI');
			let newItem = await _recognizeByIdentifier({ DOI: res.doi }, res, libraryID);
			if (newItem) return newItem;
		}
		
		if (res.isbn) {
			Zotero.debug('RecognizePDF: Getting metadata by ISBN');
			let newItem = await _recognizeByISBN(res, libraryID);
			if (newItem) return newItem;
		}
		
		if (res.title) {
			return _createItemFromResponse(res, libraryID);
		}
		
		return null;
	}
};