fulltext.js (48434B)
1 /* 2 ***** BEGIN LICENSE BLOCK ***** 3 4 Copyright © 2009 Center for History and New Media 5 George Mason University, Fairfax, Virginia, USA 6 http://zotero.org 7 8 This file is part of Zotero. 9 10 Zotero is free software: you can redistribute it and/or modify 11 it under the terms of the GNU Affero General Public License as published by 12 the Free Software Foundation, either version 3 of the License, or 13 (at your option) any later version. 14 15 Zotero is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU Affero General Public License for more details. 19 20 You should have received a copy of the GNU Affero General Public License 21 along with Zotero. If not, see <http://www.gnu.org/licenses/>. 22 23 ***** END LICENSE BLOCK ***** 24 */ 25 26 Zotero.Fulltext = Zotero.FullText = new function(){ 27 this.isCachedMIMEType = isCachedMIMEType; 28 29 this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; }); 30 this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; }); 31 32 this.INDEX_STATE_UNAVAILABLE = 0; 33 this.INDEX_STATE_UNINDEXED = 1; 34 this.INDEX_STATE_PARTIAL = 2; 35 this.INDEX_STATE_INDEXED = 3; 36 this.INDEX_STATE_QUEUED = 4; 37 38 this.SYNC_STATE_UNSYNCED = 0; 39 this.SYNC_STATE_IN_SYNC = 1; 40 this.SYNC_STATE_TO_PROCESS = 2; 41 this.SYNC_STATE_TO_DOWNLOAD = 3; 42 this.SYNC_STATE_MISSING = 4; 43 44 const _processorCacheFile = '.zotero-ft-unprocessed'; 45 46 const kWbClassSpace = 0; 47 const kWbClassAlphaLetter = 1; 48 const kWbClassPunct = 2; 49 const kWbClassHanLetter = 3; 50 const kWbClassKatakanaLetter = 4; 51 const kWbClassHiraganaLetter = 5; 52 const kWbClassHWKatakanaLetter = 6; 53 const kWbClassThaiLetter = 7; 54 55 var _pdfConverter = null; // nsIFile to executable 56 var _pdfInfo = null; // nsIFile to executable 57 var _pdfData = null; 58 59 var _idleObserverIsRegistered = false; 60 var _idleObserverDelay = 30; 61 var _processorTimeoutID = null; 62 var _processorBlacklist = {}; 63 var _upgradeCheck = true; 64 var _syncLibraryVersion = 0; 65 66 this.init = Zotero.Promise.coroutine(function* () { 67 yield Zotero.DB.queryAsync("ATTACH ':memory:' AS 'indexing'"); 68 yield Zotero.DB.queryAsync('CREATE TABLE indexing.fulltextWords (word NOT NULL)'); 69 70 this.decoder = Components.classes["@mozilla.org/intl/utf8converterservice;1"]. 71 getService(Components.interfaces.nsIUTF8ConverterService); 72 73 let pdfConverterFileName = "pdftotext"; 74 let pdfInfoFileName = "pdfinfo"; 75 76 if (Zotero.isWin) { 77 pdfConverterFileName += '.exe'; 78 pdfInfoFileName += '.exe'; 79 } 80 81 let dir = FileUtils.getDir('AChrom', []).parent; 82 83 _pdfData = dir.clone(); 84 _pdfData.append('poppler-data'); 85 _pdfData = _pdfData.path; 86 87 _pdfConverter = dir.clone(); 88 _pdfInfo = dir.clone(); 89 90 if(Zotero.isMac) { 91 _pdfConverter = _pdfConverter.parent; 92 _pdfConverter.append('MacOS'); 93 94 _pdfInfo = _pdfInfo.parent; 95 _pdfInfo.append('MacOS'); 96 } 97 98 _pdfConverter.append(pdfConverterFileName); 99 _pdfInfo.append(pdfInfoFileName); 100 101 Zotero.uiReadyPromise.delay(30000).then(() => { 102 this.registerContentProcessor(); 103 Zotero.addShutdownListener(this.unregisterContentProcessor.bind(this)); 104 105 // Start/stop content processor with full-text content syncing pref 106 Zotero.Prefs.registerObserver('sync.fulltext.enabled', (enabled) => { 107 if (enabled) { 108 this.registerContentProcessor(); 109 } 110 else { 111 this.unregisterContentProcessor(); 112 } 113 }); 114 115 // Stop content processor during syncs 116 Zotero.Notifier.registerObserver( 117 { 118 notify: Zotero.Promise.method(function (event, type, ids, extraData) { 119 if (event == 'start') { 120 this.unregisterContentProcessor(); 121 } 122 else if (event == 'stop') { 123 this.registerContentProcessor(); 124 } 125 }.bind(this)) 126 }, 127 ['sync'], 128 'fulltext' 129 ); 130 }); 131 }); 132 133 134 this.setPDFConverterPath = function(path) { 135 _pdfConverter = Zotero.File.pathToFile(path); 136 }; 137 138 139 this.setPDFInfoPath = function(path) { 140 _pdfInfo = Zotero.File.pathToFile(path); 141 142 }; 143 144 145 this.setPDFDataPath = function(path) { 146 _pdfData = path; 147 }; 148 149 150 this.getLibraryVersion = function (libraryID) { 151 if (!libraryID) throw new Error("libraryID not provided"); 152 return Zotero.DB.valueQueryAsync( 153 "SELECT version FROM version WHERE schema=?", "fulltext_" + libraryID 154 ) 155 }; 156 157 158 this.setLibraryVersion = Zotero.Promise.coroutine(function* (libraryID, version) { 159 if (!libraryID) throw new Error("libraryID not provided"); 160 yield Zotero.DB.queryAsync( 161 "REPLACE INTO version VALUES (?, ?)", ["fulltext_" + libraryID, version] 162 ); 163 }); 164 165 166 this.clearLibraryVersion = function (libraryID) { 167 return Zotero.DB.queryAsync("DELETE FROM version WHERE schema=?", "fulltext_" + libraryID); 168 }; 169 170 171 this.getItemVersion = Zotero.Promise.coroutine(function* (itemID) { 172 return Zotero.DB.valueQueryAsync( 173 "SELECT version FROM fulltextItems WHERE itemID=?", itemID 174 ) 175 }); 176 177 178 this.setItemSynced = function (itemID, version) { 179 return Zotero.DB.queryAsync( 180 "UPDATE fulltextItems SET synced=?, version=? WHERE itemID=?", 181 [this.SYNC_STATE_IN_SYNC, version, itemID] 182 ); 183 }; 184 185 186 // this is a port from http://mxr.mozilla.org/mozilla-central/source/intl/lwbrk/src/nsSampleWordBreaker.cpp to 187 // Javascript to avoid the overhead of xpcom calls. The port keeps to the mozilla naming of interfaces/constants as 188 // closely as possible. 189 function getClass(c, cc) { 190 if (cc < 0x2E80) { //alphabetical script 191 if ((cc & 0xFF80) == 0) { // ascii 192 if (c == ' ' || c == "\t" || c == "\r" || c == "\n") { return kWbClassSpace; } 193 if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; } 194 return kWbClassPunct; 195 } 196 if ((0xFF80 & cc) == 0x0E00) { return kWbClassThaiLetter; } 197 if (cc == 0x00A0/*NBSP*/) { return kWbClassSpace; } 198 199 // General and Supplemental Unicode punctuation 200 if ((cc >= 0x2000 && cc <= 0x206f) || (cc >= 0x2e00 && cc <= 0x2e7f)) { return kWbClassPunct; } 201 202 return kWbClassAlphaLetter; 203 } 204 205 if ((cc >= 0x3400 && cc <= 0x9fff) || (cc>= 0xf900 && cc <= 0xfaff)) /*han*/ { return kWbClassHanLetter; } 206 if (cc >= 0x30A0 && cc <= 0x30FF) { return kWbClassKatakanaLetter; } 207 if (cc >= 0x3040 && cc <= 0x309F) { return kWbClassHiraganaLetter; } 208 if (cc>= 0xFF60 && cc <= 0xFF9F) { return kWbClassHWKatakanaLetter; } 209 return kWbClassAlphaLetter; 210 } 211 212 213 this.getPDFConverterExecAndArgs = function () { 214 return { 215 exec: _pdfConverter, 216 args: ['-datadir', _pdfData] 217 } 218 }; 219 220 221 /* 222 * Returns true if MIME type is converted to text and cached before indexing 223 * (e.g. application/pdf is run through pdftotext) 224 */ 225 function isCachedMIMEType(mimeType) { 226 switch (mimeType) { 227 case 'application/pdf': 228 return true; 229 } 230 return false; 231 } 232 233 234 /** 235 * Index multiple words at once 236 * 237 * @requireTransaction 238 * @param {Number} itemID 239 * @param {Array<string>} words 240 * @return {Promise} 241 */ 242 var indexWords = Zotero.Promise.coroutine(function* (itemID, words) { 243 Zotero.DB.requireTransaction(); 244 let chunk; 245 yield Zotero.DB.queryAsync("DELETE FROM indexing.fulltextWords"); 246 while (words.length > 0) { 247 chunk = words.splice(0, 100); 248 yield Zotero.DB.queryAsync('INSERT INTO indexing.fulltextWords (word) ' + chunk.map(x => 'SELECT ?').join(' UNION '), chunk); 249 } 250 yield Zotero.DB.queryAsync('INSERT OR IGNORE INTO fulltextWords (word) SELECT word FROM indexing.fulltextWords'); 251 yield Zotero.DB.queryAsync('DELETE FROM fulltextItemWords WHERE itemID = ?', [itemID]); 252 yield Zotero.DB.queryAsync('INSERT OR IGNORE INTO fulltextItemWords (wordID, itemID) SELECT wordID, ? FROM fulltextWords JOIN indexing.fulltextWords USING(word)', [itemID]); 253 yield Zotero.DB.queryAsync("REPLACE INTO fulltextItems (itemID, version) VALUES (?,?)", [itemID, 0]); 254 yield Zotero.DB.queryAsync("DELETE FROM indexing.fulltextWords"); 255 }); 256 257 258 /** 259 * @return {Promise} 260 */ 261 var indexString = Zotero.Promise.coroutine(function* (text, charset, itemID, stats, version, synced) { 262 var words = this.semanticSplitter(text, charset); 263 264 while (Zotero.DB.inTransaction()) { 265 yield Zotero.DB.waitForTransaction('indexString()'); 266 } 267 268 yield Zotero.DB.executeTransaction(function* () { 269 this.clearItemWords(itemID, true); 270 yield indexWords(itemID, words, stats, version, synced); 271 272 var sql = "UPDATE fulltextItems SET synced=?"; 273 var params = [synced ? parseInt(synced) : this.SYNC_STATE_UNSYNCED]; 274 if (stats) { 275 for (let stat in stats) { 276 sql += ", " + stat + "=?"; 277 params.push(stats[stat] ? parseInt(stats[stat]) : null); 278 } 279 } 280 if (version) { 281 sql += ", version=?"; 282 params.push(parseInt(version)); 283 } 284 sql += " WHERE itemID=?"; 285 params.push(itemID); 286 yield Zotero.DB.queryAsync(sql, params); 287 288 /* 289 var sql = "REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)"; 290 Zotero.DB.query(sql, [itemID, {string:text}]); 291 */ 292 293 Zotero.Notifier.queue('refresh', 'item', itemID); 294 }.bind(this)); 295 296 // If there's a processor cache file, delete it (whether or not we just used it) 297 var item = yield Zotero.Items.getAsync(itemID); 298 var cacheFile = this.getItemProcessorCacheFile(item); 299 if (cacheFile.exists()) { 300 cacheFile.remove(false); 301 } 302 }.bind(this)); 303 304 305 /** 306 * @param {Document} document 307 * @param {Number} itemID 308 * @return {Promise} 309 */ 310 this.indexDocument = Zotero.Promise.coroutine(function* (document, itemID) { 311 if (!itemID){ 312 throw ('Item ID not provided to indexDocument()'); 313 } 314 315 Zotero.debug("Indexing document '" + document.title + "'"); 316 317 if (!Zotero.MIME.isTextType(document.contentType)) { 318 Zotero.debug(document.contentType + " document is not text", 2); 319 return false; 320 } 321 322 if (!document.body) { 323 Zotero.debug("Cannot index " + document.contentType + " file", 2); 324 return false; 325 } 326 327 if (!document.characterSet){ 328 Zotero.debug("Text file didn't have charset", 2); 329 return false; 330 } 331 332 var maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); 333 if (!maxLength) { 334 return false; 335 } 336 var obj = yield convertItemHTMLToText(itemID, document.body.innerHTML, maxLength); 337 var text = obj.text; 338 var totalChars = obj.totalChars; 339 340 if (totalChars > maxLength) { 341 Zotero.debug('Only indexing first ' + maxLength + ' characters of item ' 342 + itemID + ' in indexDocument()'); 343 } 344 345 yield indexString(text, document.characterSet, itemID); 346 yield setChars(itemID, { indexed: text.length, total: totalChars }); 347 }); 348 349 350 /** 351 * @param {String} path 352 * @param {Boolean} [complete=FALSE] Index the file in its entirety, ignoring maxLength 353 */ 354 var indexFile = Zotero.Promise.coroutine(function* (path, contentType, charset, itemID, complete, isCacheFile) { 355 if (!(yield OS.File.exists(path))) { 356 Zotero.debug('File not found in indexFile()', 2); 357 return false; 358 } 359 360 if (!contentType) { 361 Zotero.debug("Content type not provided in indexFile()", 1); 362 return false; 363 } 364 365 if (!itemID) { 366 throw new Error('Item ID not provided'); 367 } 368 369 if (contentType == 'application/pdf') { 370 return this.indexPDF(path, itemID, complete); 371 } 372 373 if (!Zotero.MIME.isTextType(contentType)) { 374 Zotero.debug('File is not text in indexFile()', 2); 375 return false; 376 } 377 378 if (!charset) { 379 Zotero.logError(`Item ${itemID} didn't have a charset`); 380 return false; 381 } 382 383 var maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); 384 if (!maxLength) { 385 return false; 386 } 387 if (complete) { 388 maxLength = null; 389 } 390 391 Zotero.debug('Indexing file ' + path); 392 var text = yield Zotero.File.getContentsAsync(path, charset); 393 var totalChars = text.length; 394 if (contentType == 'text/html') { 395 let obj = yield convertItemHTMLToText(itemID, text, maxLength); 396 text = obj.text; 397 totalChars = obj.totalChars; 398 } 399 else { 400 if (maxLength && text.length > maxLength) { 401 text = text.substr(0, maxLength); 402 } 403 } 404 405 yield indexString(text, charset, itemID); 406 407 // Record the number of characters indexed (unless we're indexing a (PDF) cache file, 408 // in which case the stats are coming from elsewhere) 409 if (!isCacheFile) { 410 yield setChars(itemID, { indexed: text.length, total: totalChars }); 411 } 412 413 return true; 414 }.bind(this)); 415 416 417 /** 418 * Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info 419 * and .zotero-ft-cache, and pass the text file back to indexFile() 420 * 421 * @param {nsIFile} file 422 * @param {Number} itemID 423 * @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages 424 * @return {Promise} 425 */ 426 this.indexPDF = Zotero.Promise.coroutine(function* (filePath, itemID, allPages) { 427 var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages'); 428 if (maxPages == 0) { 429 return false; 430 } 431 432 var item = yield Zotero.Items.getAsync(itemID); 433 var linkMode = item.attachmentLinkMode; 434 // If file is stored outside of Zotero, create a directory for the item 435 // in the storage directory and save the cache file there 436 if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) { 437 var parentDirPath = yield Zotero.Attachments.createDirectoryForItem(item); 438 } 439 else { 440 var parentDirPath = OS.Path.dirname(filePath); 441 } 442 var infoFilePath = OS.Path.join(parentDirPath, this.pdfInfoCacheFile); 443 var cacheFilePath = OS.Path.join(parentDirPath, this.pdfConverterCacheFile); 444 445 446 var args = [filePath, infoFilePath]; 447 448 try { 449 yield Zotero.Utilities.Internal.exec(_pdfInfo, args); 450 var totalPages = yield getTotalPagesFromFile(itemID); 451 } 452 catch (e) { 453 Zotero.debug("Error running " + _pdfInfo.path, 1); 454 Zotero.logError(e); 455 } 456 457 458 var {exec, args} = this.getPDFConverterExecAndArgs(); 459 args.push('-nopgbrk'); 460 461 if (allPages) { 462 if (totalPages) { 463 var pagesIndexed = totalPages; 464 } 465 } 466 else { 467 args.push('-l', maxPages); 468 var pagesIndexed = Math.min(maxPages, totalPages); 469 } 470 args.push(filePath, cacheFilePath); 471 472 try { 473 yield Zotero.Utilities.Internal.exec(exec, args); 474 } 475 catch (e) { 476 Zotero.debug("Error running " + exec.path, 1); 477 Zotero.logError(e); 478 return false; 479 } 480 481 if (!(yield OS.File.exists(cacheFilePath))) { 482 let fileName = OS.Path.basename(filePath); 483 let msg = fileName + " was not indexed"; 484 if (!fileName.match(/^[\u0000-\u007F]+$/)) { 485 msg += " -- PDFs with filenames containing extended characters cannot currently be indexed due to a Mozilla limitation"; 486 } 487 Zotero.debug(msg, 2); 488 Components.utils.reportError(msg); 489 return false; 490 } 491 492 yield indexFile(cacheFilePath, 'text/plain', 'utf-8', itemID, true, true); 493 yield setPages(itemID, { indexed: pagesIndexed, total: totalPages }); 494 495 return true; 496 }); 497 498 499 /** 500 * @param {Integer[]|Integer} items - One or more itemIDs 501 */ 502 this.indexItems = Zotero.Promise.coroutine(function* (items, complete, ignoreErrors) { 503 if (!Array.isArray(items)) { 504 items = [items]; 505 } 506 var items = yield Zotero.Items.getAsync(items); 507 var found = []; 508 509 for (let i=0; i<items.length; i++) { 510 let item = items[i]; 511 if (!item.isAttachment()) { 512 continue; 513 } 514 515 Zotero.debug("Indexing item " + item.libraryKey); 516 let itemID = item.id; 517 518 var path = yield item.getFilePathAsync(); 519 if (!path) { 520 if (yield OS.File.exists(this.getItemProcessorCacheFile(item).path)) { 521 yield Zotero.Fulltext.indexFromProcessorCache(itemID); 522 } 523 else { 524 Zotero.debug("No file to index for item " + item.libraryKey 525 + " in Zotero.FullText.indexItems()"); 526 } 527 continue; 528 } 529 530 try { 531 yield indexFile(path, item.attachmentContentType, item.attachmentCharset, itemID, complete); 532 } 533 catch (e) { 534 if (ignoreErrors) { 535 Components.utils.reportError("Error indexing " + path); 536 Zotero.logError(e); 537 } 538 else { 539 throw e; 540 } 541 } 542 } 543 }); 544 545 546 // TEMP: Temporary mechanism to serialize indexing of new attachments 547 // 548 // This should instead save the itemID to a table that's read by the content processor 549 var _queue = []; 550 var _indexing = false; 551 var _nextIndexTime; 552 var _indexDelay = 5000; 553 var _indexInterval = 500; 554 this.queueItem = function (item) { 555 // Don't index files in the background during tests 556 if (Zotero.test) return; 557 558 _queue.push(item.id); 559 _nextIndexTime = Date.now() + _indexDelay; 560 setTimeout(() => { 561 _processNextItem() 562 }, _indexDelay); 563 }; 564 565 async function _processNextItem() { 566 if (!_queue.length) return; 567 // Another _processNextItem() was scheduled 568 if (Date.now() < _nextIndexTime) return; 569 // If indexing is already running, _processNextItem() will be called when it's done 570 if (_indexing) return; 571 _indexing = true; 572 var itemID = _queue.shift(); 573 try { 574 await Zotero.Fulltext.indexItems([itemID], false, true); 575 } 576 finally { 577 _indexing = false; 578 } 579 setTimeout(() => { 580 _processNextItem(); 581 }, _indexInterval); 582 }; 583 584 585 // 586 // Full-text content syncing 587 // 588 /** 589 * Get content and stats that haven't yet been synced 590 * 591 * @param {Integer} libraryID 592 * @param {Integer} [options] 593 * @param {Integer} [options.maxSize] 594 * @param {Integer} [options.maxItems] 595 * @param {Integer} [options.lastItemID] - Only return content for items above this id 596 * @return {Promise<Array<Object>>} 597 */ 598 this.getUnsyncedContent = Zotero.Promise.coroutine(function* (libraryID, options = {}) { 599 var contentItems = []; 600 var sql = "SELECT itemID, indexedChars, totalChars, indexedPages, totalPages " 601 + "FROM fulltextItems FI JOIN items I USING (itemID) WHERE libraryID=? AND " 602 + "FI.synced=? AND I.synced=1 "; 603 var params = [libraryID, this.SYNC_STATE_UNSYNCED]; 604 if (options.lastItemID) { 605 sql += "AND itemID>?"; 606 params.push(options.lastItemID); 607 } 608 sql += "ORDER BY itemID"; 609 var rows = yield Zotero.DB.queryAsync(sql, params); 610 var contentSize = 0; 611 for (let i = 0; i < rows.length; i++) { 612 let row = rows[i]; 613 let content; 614 let itemID = row.itemID; 615 let item = yield Zotero.Items.getAsync(itemID); 616 let libraryKey = item.libraryKey; 617 let contentType = item.attachmentContentType; 618 if (contentType && (isCachedMIMEType(contentType) || Zotero.MIME.isTextType(contentType))) { 619 try { 620 let cacheFile = this.getItemCacheFile(item).path; 621 if (yield OS.File.exists(cacheFile)) { 622 Zotero.debug("Getting full-text content from cache " 623 + "file for item " + libraryKey); 624 content = yield Zotero.File.getContentsAsync(cacheFile); 625 } 626 else { 627 // If there should be a cache file and isn't, mark the full text as missing 628 if (!Zotero.MIME.isTextType(contentType)) { 629 Zotero.debug("Full-text content cache file doesn't exist for item " 630 + libraryKey, 2); 631 let sql = "UPDATE fulltextItems SET synced=? WHERE itemID=?"; 632 yield Zotero.DB.queryAsync(sql, [this.SYNC_STATE_MISSING, item.id]); 633 continue; 634 } 635 636 // Same for missing attachments 637 let path = yield item.getFilePathAsync(); 638 if (!path) { 639 Zotero.debug("File doesn't exist getting full-text content for item " 640 + libraryKey, 2); 641 let sql = "UPDATE fulltextItems SET synced=? WHERE itemID=?"; 642 yield Zotero.DB.queryAsync(sql, [this.SYNC_STATE_MISSING, item.id]); 643 continue; 644 } 645 646 Zotero.debug("Getting full-text content from file for item " + libraryKey); 647 content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset); 648 649 // If HTML, convert to plain text first, and cache the result 650 if (item.attachmentContentType == 'text/html') { 651 let obj = yield convertItemHTMLToText( 652 itemID, 653 content, 654 // Include in the cache file only as many characters as we 655 // indexed previously 656 row.indexedChars 657 ); 658 content = obj.text; 659 } 660 else { 661 // Include only as many characters as we've indexed 662 content = content.substr(0, row.indexedChars); 663 } 664 } 665 } 666 catch (e) { 667 Zotero.logError(e); 668 continue; 669 } 670 } 671 else { 672 Zotero.debug("Skipping non-text file getting full-text content for item " 673 + `${libraryKey} (contentType: ${contentType})`, 2); 674 675 // Delete rows for items that weren't supposed to be indexed 676 yield Zotero.DB.executeTransaction(function* () { 677 yield this.clearItemWords(itemID); 678 }.bind(this)); 679 continue; 680 } 681 682 // If this isn't the first item and it would put us over the size limit, stop 683 if (contentItems.length && options.maxSize && contentSize + content.length > options.maxSize) { 684 break; 685 } 686 687 contentItems.push({ 688 itemID: item.id, 689 key: item.key, 690 content, 691 indexedChars: row.indexedChars ? row.indexedChars : 0, 692 totalChars: row.totalChars ? row.totalChars : 0, 693 indexedPages: row.indexedPages ? row.indexedPages : 0, 694 totalPages: row.totalPages ? row.totalPages : 0 695 }); 696 697 if (options.maxItems && contentItems.length >= options.maxItems) { 698 break; 699 } 700 contentSize += content.length; 701 } 702 return contentItems; 703 }); 704 705 706 /** 707 * @return {String} PHP-formatted POST data for items not yet downloaded 708 */ 709 this.getUndownloadedPostData = Zotero.Promise.coroutine(function* () { 710 // TODO: Redo for API syncing 711 712 // On upgrade, get all content 713 var sql = "SELECT value FROM settings WHERE setting='fulltext' AND key='downloadAll'"; 714 if (yield Zotero.DB.valueQueryAsync(sql)) { 715 return "&ftkeys=all"; 716 } 717 718 var sql = "SELECT itemID FROM fulltextItems WHERE synced=" + this.SYNC_STATE_TO_DOWNLOAD; 719 var itemIDs = yield Zotero.DB.columnQueryAsync(sql); 720 if (!itemIDs) { 721 return ""; 722 } 723 var undownloaded = {}; 724 for (let i=0; i<itemIDs.length; i++) { 725 let itemID = itemIDs[i]; 726 let item = yield Zotero.Items.getAsync(itemID); 727 let libraryID = item.libraryID 728 if (!undownloaded[libraryID]) { 729 undownloaded[libraryID] = []; 730 } 731 undownloaded[libraryID].push(item.key); 732 } 733 var data = ""; 734 for (let libraryID in undownloaded) { 735 for (let i = 0; i < undownloaded[libraryID].length; i++) { 736 data += "&" + encodeURIComponent("ftkeys[" + libraryID + "][" + i + "]") 737 + "=" + undownloaded[libraryID][i]; 738 } 739 } 740 return data; 741 }); 742 743 744 /** 745 * Save full-text content and stats to a cache file 746 * 747 * @param {Integer} libraryID 748 * @param {String} key - Item key 749 * @param {Object} data 750 * @param {String} data.content 751 * @param {Integer} [data.indexedChars] 752 * @param {Integer} [data.totalChars] 753 * @param {Integer} [data.indexedPages] 754 * @param {Integer} [data.totalPages] 755 * @param {Integer} version 756 * @return {Promise} 757 */ 758 this.setItemContent = Zotero.Promise.coroutine(function* (libraryID, key, data, version) { 759 var libraryKey = libraryID + "/" + key; 760 var item = Zotero.Items.getByLibraryAndKey(libraryID, key); 761 if (!item) { 762 let msg = "Item " + libraryKey + " not found setting full-text content"; 763 Zotero.logError(msg); 764 return; 765 } 766 var itemID = item.id; 767 var currentVersion = yield this.getItemVersion(itemID) 768 769 var processorCacheFile = this.getItemProcessorCacheFile(item).path; // .zotero-ft-unprocessed 770 var itemCacheFile = this.getItemCacheFile(item).path; // .zotero-ft-cache 771 772 // If a storage directory doesn't exist, create it 773 if (!(yield OS.File.exists(OS.Path.dirname(processorCacheFile)))) { 774 yield Zotero.Attachments.createDirectoryForItem(item); 775 } 776 777 // If indexed previously and the existing extracted text matches the new text, 778 // just update the version 779 if (currentVersion !== false 780 && (yield OS.File.exists(itemCacheFile)) 781 && (yield Zotero.File.getContentsAsync(itemCacheFile)) == data.content) { 782 Zotero.debug("Current full-text content matches remote for item " 783 + libraryKey + " -- updating version"); 784 return Zotero.DB.queryAsync( 785 "REPLACE INTO fulltextItems (itemID, version, synced) VALUES (?, ?, ?)", 786 [itemID, version, this.SYNC_STATE_IN_SYNC] 787 ); 788 } 789 790 // Otherwise save data to -unprocessed file 791 Zotero.debug("Writing full-text content and data for item " + libraryKey 792 + " to " + processorCacheFile); 793 yield Zotero.File.putContentsAsync(processorCacheFile, JSON.stringify({ 794 indexedChars: data.indexedChars, 795 totalChars: data.totalChars, 796 indexedPages: data.indexedPages, 797 totalPages: data.totalPages, 798 version, 799 text: data.content 800 })); 801 var synced = this.SYNC_STATE_TO_PROCESS; 802 803 // If indexed previously, update the sync state 804 if (currentVersion !== false) { 805 yield Zotero.DB.queryAsync("UPDATE fulltextItems SET synced=? WHERE itemID=?", [synced, itemID]); 806 } 807 // If not yet indexed, add an empty row 808 else { 809 yield Zotero.DB.queryAsync( 810 "REPLACE INTO fulltextItems (itemID, version, synced) VALUES (?, 0, ?)", 811 [itemID, synced] 812 ); 813 } 814 815 this.registerContentProcessor(); 816 }); 817 818 819 /** 820 * Start the idle observer for the background content processor 821 */ 822 this.registerContentProcessor = function () { 823 // Don't start idle observer during tests 824 if (Zotero.test) return; 825 if (!Zotero.Prefs.get('sync.fulltext.enabled')) return; 826 827 if (!_idleObserverIsRegistered) { 828 Zotero.debug("Starting full-text content processor"); 829 var idleService = Components.classes["@mozilla.org/widget/idleservice;1"] 830 .getService(Components.interfaces.nsIIdleService); 831 idleService.addIdleObserver(this.idleObserver, _idleObserverDelay); 832 _idleObserverIsRegistered = true; 833 } 834 } 835 836 837 this.unregisterContentProcessor = function () { 838 if (_idleObserverIsRegistered) { 839 Zotero.debug("Unregistering full-text content processor idle observer"); 840 var idleService = Components.classes["@mozilla.org/widget/idleservice;1"] 841 .getService(Components.interfaces.nsIIdleService); 842 idleService.removeIdleObserver(this.idleObserver, _idleObserverDelay); 843 _idleObserverIsRegistered = false; 844 } 845 846 this.stopContentProcessor(); 847 } 848 849 850 /** 851 * Stop the idle observer and a running timer, if there is one 852 */ 853 this.stopContentProcessor = function () { 854 Zotero.debug("Stopping full-text content processor"); 855 if (_processorTimeoutID) { 856 clearTimeout(_processorTimeoutID); 857 _processorTimeoutID = null; 858 } 859 } 860 861 /** 862 * Find items marked as having unprocessed cache files, run cache file processing on one item, and 863 * after a short delay call self again with the remaining items 864 * 865 * @param {Array<Integer>} itemIDs An array of itemIDs to process; if this 866 * is omitted, a database query is made 867 * to find unprocessed content 868 * @return {Boolean} TRUE if there's more content to process; FALSE otherwise 869 */ 870 this.processUnprocessedContent = Zotero.Promise.coroutine(function* (itemIDs) { 871 // Idle observer can take a little while to trigger and may not cancel the setTimeout() 872 // in time, so check idle time directly 873 var idleService = Components.classes["@mozilla.org/widget/idleservice;1"] 874 .getService(Components.interfaces.nsIIdleService); 875 if (idleService.idleTime < _idleObserverDelay * 1000) { 876 return; 877 } 878 879 if (!itemIDs) { 880 Zotero.debug("Checking for unprocessed full-text content"); 881 let sql = "SELECT itemID FROM fulltextItems WHERE synced=" + this.SYNC_STATE_TO_PROCESS; 882 itemIDs = yield Zotero.DB.columnQueryAsync(sql); 883 } 884 885 var origLen = itemIDs.length; 886 itemIDs = itemIDs.filter(function (id) { 887 return !(id in _processorBlacklist); 888 }); 889 if (itemIDs.length < origLen) { 890 let skipped = (origLen - itemIDs.length); 891 Zotero.debug("Skipping large full-text content for " + skipped 892 + " item" + (skipped == 1 ? '' : 's')); 893 } 894 895 // If there's no more unprocessed content, stop the idle observer 896 if (!itemIDs.length) { 897 Zotero.debug("No unprocessed full-text content found"); 898 this.unregisterContentProcessor(); 899 return; 900 } 901 902 let itemID = itemIDs.shift(); 903 let item = yield Zotero.Items.getAsync(itemID); 904 905 Zotero.debug("Processing full-text content for item " + item.libraryKey); 906 907 yield Zotero.Fulltext.indexFromProcessorCache(itemID); 908 909 if (!itemIDs.length || idleService.idleTime < _idleObserverDelay * 1000) { 910 return; 911 } 912 913 // If there are remaining items, call self again after a short delay. The delay allows 914 // for processing to be interrupted if the user returns from idle. At least on macOS, 915 // when Zotero is in the background this can be throttled to 10 seconds. 916 _processorTimeoutID = setTimeout(() => this.processUnprocessedContent(itemIDs), 200); 917 }); 918 919 this.idleObserver = { 920 observe: function (subject, topic, data) { 921 // On idle, start the background processor 922 if (topic == 'idle') { 923 this.processUnprocessedContent(); 924 } 925 // When back from idle, stop the processor (but keep the idle observer registered) 926 else if (topic == 'active') { 927 this.stopContentProcessor(); 928 } 929 }.bind(this) 930 }; 931 932 933 /** 934 * @param {Number} itemID 935 * @return {Promise<Boolean>} 936 */ 937 this.indexFromProcessorCache = Zotero.Promise.coroutine(function* (itemID) { 938 try { 939 var item = yield Zotero.Items.getAsync(itemID); 940 var cacheFile = this.getItemProcessorCacheFile(item).path; 941 if (!(yield OS.File.exists(cacheFile))) { 942 Zotero.debug("Full-text content processor cache file doesn't exist for item " + itemID); 943 yield Zotero.DB.queryAsync( 944 "UPDATE fulltextItems SET synced=? WHERE itemID=?", 945 [this.SYNC_STATE_UNSYNCED, itemID] 946 ); 947 return false; 948 } 949 950 var json = yield Zotero.File.getContentsAsync(cacheFile); 951 var data = JSON.parse(json); 952 953 // Write the text content to the regular cache file 954 var item = yield Zotero.Items.getAsync(itemID); 955 cacheFile = this.getItemCacheFile(item).path; 956 Zotero.debug("Writing full-text content to " + cacheFile); 957 yield Zotero.File.putContentsAsync(cacheFile, data.text); 958 959 yield indexString( 960 data.text, 961 "UTF-8", 962 itemID, 963 { 964 indexedChars: data.indexedChars, 965 totalChars: data.totalChars, 966 indexedPages: data.indexedPages, 967 totalPages: data.totalPages 968 }, 969 data.version, 970 1 971 ); 972 973 return true; 974 } 975 catch (e) { 976 Components.utils.reportError(e); 977 Zotero.debug(e, 1); 978 return false; 979 }; 980 }); 981 982 // 983 // End full-text content syncing 984 // 985 986 987 /* 988 * Scan a string for another string 989 * 990 * _items_ -- one or more attachment items to search 991 * _searchText_ -- text pattern to search for 992 * _mode_: 993 * 'regexp' -- regular expression (case-insensitive) 994 * 'regexpCS' -- regular expression (case-sensitive) 995 * 996 * - Slashes in regex are optional 997 */ 998 function findTextInString(content, searchText, mode) { 999 switch (mode){ 1000 case 'regexp': 1001 case 'regexpCS': 1002 case 'regexpBinary': 1003 case 'regexpCSBinary': 1004 // Do a multiline search by default 1005 var flags = 'm'; 1006 var parts = searchText.match(/^\/(.*)\/([^\/]*)/); 1007 if (parts){ 1008 searchText = parts[1]; 1009 // Ignore user-supplied flags 1010 //flags = parts[2]; 1011 } 1012 1013 if (mode.indexOf('regexpCS')==-1){ 1014 flags += 'i'; 1015 } 1016 1017 try { 1018 var re = new RegExp(searchText, flags); 1019 var matches = re.exec(content); 1020 } 1021 catch (e) { 1022 Zotero.debug(e, 1); 1023 Components.utils.reportError(e); 1024 } 1025 if (matches){ 1026 Zotero.debug("Text found"); 1027 return content.substr(matches.index, 50); 1028 } 1029 1030 break; 1031 1032 default: 1033 // Case-insensitive 1034 searchText = searchText.toLowerCase(); 1035 content = content.toLowerCase(); 1036 1037 var pos = content.indexOf(searchText); 1038 if (pos!=-1){ 1039 Zotero.debug('Text found'); 1040 return content.substr(pos, 50); 1041 } 1042 } 1043 1044 return -1; 1045 } 1046 1047 /** 1048 * Scan item files for a text string 1049 * 1050 * _items_ -- one or more attachment items to search 1051 * _searchText_ -- text pattern to search for 1052 * _mode_: 1053 * 'phrase' 1054 * 'regexp' 1055 * 'regexpCS' -- case-sensitive regular expression 1056 * 1057 * Note: 1058 * - Slashes in regex are optional 1059 * - Add 'Binary' to the mode to search all files, not just text files 1060 * 1061 * @return {Promise<Array<Object>>} A promise for an array of match objects, with 'id' containing 1062 * an itemID and 'match' containing a string snippet 1063 */ 1064 this.findTextInItems = Zotero.Promise.coroutine(function* (items, searchText, mode){ 1065 if (!searchText){ 1066 return []; 1067 } 1068 1069 var items = yield Zotero.Items.getAsync(items); 1070 var found = []; 1071 1072 for (let i=0; i<items.length; i++) { 1073 let item = items[i]; 1074 if (!item.isAttachment()) { 1075 continue; 1076 } 1077 1078 let itemID = item.id; 1079 let content; 1080 let mimeType = item.attachmentContentType; 1081 let maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); 1082 let binaryMode = mode && mode.indexOf('Binary') != -1; 1083 1084 if (isCachedMIMEType(mimeType)) { 1085 let file = this.getItemCacheFile(item).path; 1086 if (!(yield OS.File.exists(file))) { 1087 continue; 1088 } 1089 1090 Zotero.debug("Searching for text '" + searchText + "' in " + file); 1091 content = yield Zotero.File.getContentsAsync(file, 'utf-8', maxLength); 1092 } 1093 else { 1094 // If not binary mode, only scan plaintext files 1095 if (!binaryMode) { 1096 if (!Zotero.MIME.isTextType(mimeType)) { 1097 Zotero.debug('Not scanning MIME type ' + mimeType, 4); 1098 continue; 1099 } 1100 } 1101 1102 // Check for a cache file 1103 let cacheFile = this.getItemCacheFile(item).path; 1104 if (yield OS.File.exists(cacheFile)) { 1105 Zotero.debug("Searching for text '" + searchText + "' in " + cacheFile); 1106 content = yield Zotero.File.getContentsAsync(cacheFile, 'utf-8', maxLength); 1107 } 1108 else { 1109 // If that doesn't exist, check for the actual file 1110 let path = yield item.getFilePathAsync(); 1111 if (!path) { 1112 continue; 1113 } 1114 1115 Zotero.debug("Searching for text '" + searchText + "' in " + path); 1116 content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset); 1117 1118 // If HTML and not binary mode, convert to text 1119 if (mimeType == 'text/html' && !binaryMode) { 1120 // Include in the cache file only as many characters as we've indexed 1121 let chars = yield getChars(itemID); 1122 1123 let obj = yield convertItemHTMLToText( 1124 itemID, content, chars ? chars.indexedChars : null 1125 ); 1126 content = obj.text; 1127 } 1128 } 1129 } 1130 1131 let match = findTextInString(content, searchText, mode); 1132 if (match != -1) { 1133 found.push({ 1134 id: itemID, 1135 match: match 1136 }); 1137 } 1138 } 1139 1140 return found; 1141 }); 1142 1143 1144 /** 1145 * @requireTransaction 1146 */ 1147 this.clearItemWords = Zotero.Promise.coroutine(function* (itemID, skipCacheClear) { 1148 Zotero.DB.requireTransaction(); 1149 1150 var sql = "SELECT rowid FROM fulltextItems WHERE itemID=? LIMIT 1"; 1151 var indexed = yield Zotero.DB.valueQueryAsync(sql, itemID); 1152 if (indexed) { 1153 yield Zotero.DB.queryAsync("DELETE FROM fulltextItemWords WHERE itemID=?", itemID); 1154 yield Zotero.DB.queryAsync("DELETE FROM fulltextItems WHERE itemID=?", itemID); 1155 } 1156 1157 if (indexed) { 1158 Zotero.Prefs.set('purge.fulltext', true); 1159 } 1160 1161 if (!skipCacheClear) { 1162 // Delete fulltext cache file if there is one 1163 yield clearCacheFile(itemID); 1164 } 1165 }); 1166 1167 1168 /** 1169 * @return {Promise} 1170 */ 1171 this.getPages = function (itemID, force) { 1172 var sql = "SELECT indexedPages, totalPages AS total " 1173 + "FROM fulltextItems WHERE itemID=?"; 1174 return Zotero.DB.rowQueryAsync(sql, itemID); 1175 } 1176 1177 1178 /** 1179 * Gets the number of pages from the PDF info cache file 1180 * 1181 * @private 1182 * @return {Promise} 1183 */ 1184 var getTotalPagesFromFile = Zotero.Promise.coroutine(function* (itemID) { 1185 var file = OS.Path.join( 1186 Zotero.Attachments.getStorageDirectoryByID(itemID).path, 1187 Zotero.Fulltext.pdfInfoCacheFile 1188 ); 1189 if (!(yield OS.File.exists(file))) { 1190 return false; 1191 } 1192 var contents = yield Zotero.File.getContentsAsync(file); 1193 try { 1194 // Parse pdfinfo output 1195 var pages = contents.match('Pages:[^0-9]+([0-9]+)')[1]; 1196 } 1197 catch (e) { 1198 Zotero.debug(e); 1199 return false; 1200 } 1201 return pages; 1202 }); 1203 1204 1205 /** 1206 * @return {Promise} 1207 */ 1208 function getChars(itemID) { 1209 var sql = "SELECT indexedChars, totalChars AS total " 1210 + "FROM fulltextItems WHERE itemID=?"; 1211 return Zotero.DB.rowQueryAsync(sql, itemID); 1212 } 1213 1214 1215 /** 1216 * Gets the number of characters from the PDF converter cache file 1217 * 1218 * @return {Promise} 1219 */ 1220 var getTotalCharsFromFile = Zotero.Promise.coroutine(function* (itemID) { 1221 var item = yield Zotero.Items.getAsync(itemID); 1222 switch (item.attachmentContentType) { 1223 case 'application/pdf': 1224 var file = OS.Path.join( 1225 Zotero.Attachments.getStorageDirectory(item).path, 1226 this.pdfConverterCacheFile 1227 ); 1228 if (!(yield OS.File.exists(file))) { 1229 return false; 1230 } 1231 break; 1232 1233 default: 1234 var file = yield item.getFilePathAsync(); 1235 if (!file) { 1236 return false; 1237 } 1238 } 1239 1240 var contents = yield Zotero.File.getContentsAsync(file); 1241 return contents.length; 1242 }); 1243 1244 1245 /** 1246 * @return {Promise} 1247 */ 1248 function setPages(itemID, obj) { 1249 var sql = "UPDATE fulltextItems SET indexedPages=?, totalPages=? WHERE itemID=?"; 1250 return Zotero.DB.queryAsync( 1251 sql, 1252 [ 1253 obj.indexed ? parseInt(obj.indexed) : null, 1254 obj.total ? parseInt(obj.total) : null, 1255 itemID 1256 ] 1257 ); 1258 } 1259 1260 1261 /** 1262 * @param {Number} itemID 1263 * @param {Object} obj 1264 * @return {Promise} 1265 */ 1266 function setChars(itemID, obj) { 1267 var sql = "UPDATE fulltextItems SET indexedChars=?, totalChars=? WHERE itemID=?"; 1268 return Zotero.DB.queryAsync( 1269 sql, 1270 [ 1271 obj.indexed ? parseInt(obj.indexed) : null, 1272 obj.total ? parseInt(obj.total) : null, 1273 itemID 1274 ] 1275 ); 1276 } 1277 1278 1279 /* 1280 * Gets the indexed state of an item, 1281 */ 1282 this.getIndexedState = Zotero.Promise.coroutine(function* (item) { 1283 if (!item.isAttachment()) { 1284 throw new Error('Item is not an attachment'); 1285 } 1286 1287 // If the file or cache file wasn't available during syncing, mark as unindexed 1288 var synced = yield Zotero.DB.valueQueryAsync( 1289 "SELECT synced FROM fulltextItems WHERE itemID=?", item.id 1290 ); 1291 if (synced === false || synced == this.SYNC_STATE_MISSING) { 1292 return this.INDEX_STATE_UNINDEXED; 1293 } 1294 1295 var itemID = item.id; 1296 var state = this.INDEX_STATE_UNINDEXED; 1297 switch (item.attachmentContentType) { 1298 // Use pages for PDFs 1299 case 'application/pdf': 1300 var o = yield this.getPages(itemID); 1301 if (o) { 1302 var stats = { 1303 indexed: o.indexedPages, 1304 total: o.total 1305 }; 1306 } 1307 break; 1308 1309 default: 1310 var o = yield getChars(itemID); 1311 if (o) { 1312 var stats = { 1313 indexed: o.indexedChars, 1314 total: o.total 1315 }; 1316 } 1317 } 1318 1319 if (stats) { 1320 if (!stats.total && !stats.indexed) { 1321 let queued = false; 1322 try { 1323 queued = yield OS.File.exists(this.getItemProcessorCacheFile(item).path); 1324 } 1325 catch (e) { 1326 Zotero.logError(e); 1327 } 1328 state = queued ? this.INDEX_STATE_QUEUED : this.INDEX_STATE_UNAVAILABLE; 1329 } 1330 else if (!stats.indexed) { 1331 state = this.INDEX_STATE_UNINDEXED; 1332 } 1333 else if (stats.indexed < stats.total) { 1334 state = this.INDEX_STATE_PARTIAL; 1335 } 1336 else { 1337 state = this.INDEX_STATE_INDEXED; 1338 } 1339 } 1340 return state; 1341 }); 1342 1343 1344 this.isFullyIndexed = Zotero.Promise.coroutine(function* (item) { 1345 return (yield this.getIndexedState(item)) == this.INDEX_STATE_INDEXED; 1346 }); 1347 1348 1349 /** 1350 * @return {Promise} 1351 */ 1352 this.getIndexStats = Zotero.Promise.coroutine(function* () { 1353 var sql = "SELECT COUNT(*) FROM fulltextItems WHERE synced != ? AND " 1354 + "((indexedPages IS NOT NULL AND indexedPages=totalPages) OR " 1355 + "(indexedChars IS NOT NULL AND indexedChars=totalChars))" 1356 var indexed = yield Zotero.DB.valueQueryAsync(sql, this.SYNC_STATE_MISSING); 1357 1358 var sql = "SELECT COUNT(*) FROM fulltextItems WHERE " 1359 + "(indexedPages IS NOT NULL AND indexedPages<totalPages) OR " 1360 + "(indexedChars IS NOT NULL AND indexedChars<totalChars)" 1361 var partial = yield Zotero.DB.valueQueryAsync(sql); 1362 1363 var sql = "SELECT COUNT(*) FROM itemAttachments WHERE itemID NOT IN " 1364 + "(SELECT itemID FROM fulltextItems WHERE synced != ? AND " 1365 + "(indexedPages IS NOT NULL OR indexedChars IS NOT NULL))"; 1366 var unindexed = yield Zotero.DB.valueQueryAsync(sql, this.SYNC_STATE_MISSING); 1367 1368 var sql = "SELECT COUNT(*) FROM fulltextWords"; 1369 var words = yield Zotero.DB.valueQueryAsync(sql); 1370 1371 return { indexed, partial, unindexed, words }; 1372 }); 1373 1374 1375 this.getItemCacheFile = function (item) { 1376 var cacheFile = Zotero.Attachments.getStorageDirectory(item); 1377 cacheFile.append(this.pdfConverterCacheFile); 1378 return cacheFile; 1379 } 1380 1381 1382 this.getItemProcessorCacheFile = function (item) { 1383 var cacheFile = Zotero.Attachments.getStorageDirectory(item); 1384 cacheFile.append(_processorCacheFile); 1385 return cacheFile; 1386 } 1387 1388 1389 /* 1390 * Returns true if an item can be reindexed 1391 * 1392 * Item must be a non-web-link attachment that isn't already fully indexed 1393 */ 1394 this.canReindex = Zotero.Promise.coroutine(function* (item) { 1395 if (item.isAttachment() 1396 && item.attachmentLinkMode != Zotero.Attachments.LINK_MODE_LINKED_URL) { 1397 let contentType = item.attachmentContentType; 1398 if (!contentType || contentType != 'application/pdf' && !Zotero.MIME.isTextType(contentType)) { 1399 return false; 1400 } 1401 switch (yield this.getIndexedState(item)) { 1402 case this.INDEX_STATE_UNAVAILABLE: 1403 case this.INDEX_STATE_UNINDEXED: 1404 case this.INDEX_STATE_PARTIAL: 1405 case this.INDEX_STATE_QUEUED: 1406 1407 // TODO: automatically reindex already-indexed attachments? 1408 case this.INDEX_STATE_INDEXED: 1409 return true; 1410 } 1411 } 1412 return false; 1413 }); 1414 1415 1416 /** 1417 * @return {Promise} 1418 */ 1419 this.rebuildIndex = Zotero.Promise.coroutine(function* (unindexedOnly) { 1420 // Get all attachments other than web links 1421 var sql = "SELECT itemID FROM itemAttachments WHERE linkMode!=" 1422 + Zotero.Attachments.LINK_MODE_LINKED_URL; 1423 var params = []; 1424 if (unindexedOnly) { 1425 sql += " AND itemID NOT IN (SELECT itemID FROM fulltextItems " 1426 + "WHERE synced != ? AND (indexedChars IS NOT NULL OR indexedPages IS NOT NULL))"; 1427 params.push(this.SYNC_STATE_MISSING); 1428 } 1429 var items = yield Zotero.DB.columnQueryAsync(sql, params); 1430 if (items) { 1431 yield Zotero.DB.executeTransaction(function* () { 1432 yield Zotero.DB.queryAsync( 1433 "DELETE FROM fulltextItemWords WHERE itemID IN (" + sql + ")", params 1434 ); 1435 yield Zotero.DB.queryAsync( 1436 "DELETE FROM fulltextItems WHERE itemID IN (" + sql + ")", params 1437 ); 1438 }); 1439 1440 yield this.indexItems(items, false, true); 1441 } 1442 }); 1443 1444 1445 /** 1446 * Clears full-text word index and all full-text cache files 1447 * 1448 * @return {Promise} 1449 */ 1450 this.clearIndex = function (skipLinkedURLs) { 1451 return Zotero.DB.executeTransaction(function* () { 1452 var sql = "DELETE FROM fulltextItems"; 1453 if (skipLinkedURLs) { 1454 var linkSQL = "SELECT itemID FROM itemAttachments WHERE linkMode =" 1455 + Zotero.Attachments.LINK_MODE_LINKED_URL; 1456 1457 sql += " WHERE itemID NOT IN (" + linkSQL + ")"; 1458 } 1459 yield Zotero.DB.queryAsync(sql); 1460 1461 sql = "DELETE FROM fulltextItemWords"; 1462 if (skipLinkedURLs) { 1463 sql += " WHERE itemID NOT IN (" + linkSQL + ")"; 1464 } 1465 yield Zotero.DB.queryAsync(sql); 1466 1467 if (skipLinkedURLs) { 1468 yield this.purgeUnusedWords(); 1469 } 1470 else { 1471 yield Zotero.DB.queryAsync("DELETE FROM fulltextWords"); 1472 } 1473 1474 yield clearCacheFiles(); 1475 }.bind(this)); 1476 } 1477 1478 1479 /* 1480 * Clears cache file for an item 1481 */ 1482 var clearCacheFile = Zotero.Promise.coroutine(function* (itemID) { 1483 var item = yield Zotero.Items.getAsync(itemID); 1484 if (!item) { 1485 return; 1486 } 1487 1488 if (!item.isAttachment()) { 1489 Zotero.debug("Item " + itemID + " is not an attachment in Zotero.Fulltext.clearCacheFile()"); 1490 return; 1491 } 1492 1493 Zotero.debug('Clearing full-text cache file for item ' + itemID); 1494 var cacheFile = Zotero.Fulltext.getItemCacheFile(item); 1495 if (cacheFile.exists()) { 1496 try { 1497 cacheFile.remove(false); 1498 } 1499 catch (e) { 1500 Zotero.File.checkFileAccessError(e, cacheFile, 'delete'); 1501 } 1502 } 1503 }); 1504 1505 1506 /* 1507 * Clear cache files for all attachments 1508 */ 1509 var clearCacheFiles = Zotero.Promise.coroutine(function* (skipLinkedURLs) { 1510 var sql = "SELECT itemID FROM itemAttachments"; 1511 if (skipLinkedURLs) { 1512 sql += " WHERE linkMode != " + Zotero.Attachments.LINK_MODE_LINKED_URL; 1513 } 1514 var items = yield Zotero.DB.columnQueryAsync(sql); 1515 for (var i=0; i<items.length; i++) { 1516 yield clearCacheFile(items[i]); 1517 } 1518 }); 1519 1520 1521 /* 1522 function clearItemContent(itemID){ 1523 Zotero.DB.query("DELETE FROM fulltextContent WHERE itemID=" + itemID); 1524 } 1525 */ 1526 1527 1528 /** 1529 * @return {Promise} 1530 */ 1531 this.purgeUnusedWords = Zotero.Promise.coroutine(function* () { 1532 if (!Zotero.Prefs.get('purge.fulltext')) { 1533 return; 1534 } 1535 1536 var sql = "DELETE FROM fulltextWords WHERE wordID NOT IN " 1537 + "(SELECT wordID FROM fulltextItemWords)"; 1538 yield Zotero.DB.queryAsync(sql); 1539 1540 Zotero.Prefs.set('purge.fulltext', false) 1541 }); 1542 1543 1544 /** 1545 * Convert HTML to text for an item and cache the result 1546 * 1547 * @return {Promise} 1548 */ 1549 var convertItemHTMLToText = Zotero.Promise.coroutine(function* (itemID, html, maxLength) { 1550 // Split elements to avoid word concatentation 1551 html = html.replace(/>/g, '> '); 1552 1553 var text = HTMLToText(html); 1554 var totalChars = text.length; 1555 1556 if (maxLength) { 1557 text = text.substr(0, maxLength); 1558 } 1559 1560 // Write the converted text to a cache file 1561 var item = yield Zotero.Items.getAsync(itemID); 1562 var cacheFile = Zotero.Fulltext.getItemCacheFile(item).path; 1563 Zotero.debug("Writing converted full-text HTML content to " + cacheFile); 1564 if (!(yield OS.File.exists(OS.Path.dirname(cacheFile)))) { 1565 yield Zotero.Attachments.createDirectoryForItem(item); 1566 } 1567 yield Zotero.File.putContentsAsync(cacheFile, text) 1568 .catch(function (e) { 1569 Zotero.debug(e, 1); 1570 Components.utils.reportError(e); 1571 }); 1572 1573 return { 1574 text: text, 1575 totalChars: totalChars 1576 }; 1577 }); 1578 1579 function HTMLToText(html) { 1580 var nsIFC = Components.classes['@mozilla.org/widget/htmlformatconverter;1'] 1581 .createInstance(Components.interfaces.nsIFormatConverter); 1582 var from = Components.classes['@mozilla.org/supports-string;1'] 1583 .createInstance(Components.interfaces.nsISupportsString); 1584 from.data = html; 1585 var to = { value: null }; 1586 try { 1587 nsIFC.convert('text/html', from, from.toString().length, 'text/unicode', to, {}); 1588 to = to.value.QueryInterface(Components.interfaces.nsISupportsString); 1589 return to.toString(); 1590 } 1591 catch(e) { 1592 Zotero.debug(e, 1); 1593 return html; 1594 } 1595 } 1596 1597 1598 /** 1599 * @param {String} text 1600 * @param {String} [charset] 1601 * @return {Array<String>} 1602 */ 1603 this.semanticSplitter = function (text, charset) { 1604 if (!text){ 1605 Zotero.debug('No text to index'); 1606 return []; 1607 } 1608 1609 try { 1610 if (charset && charset != 'utf-8') { 1611 text = this.decoder.convertStringToUTF8(text, charset, true); 1612 } 1613 } catch (err) { 1614 Zotero.debug("Error converting from charset " + charset, 1); 1615 Zotero.debug(err, 1); 1616 } 1617 1618 var words = {}; 1619 var word = ''; 1620 var cclass = null; 1621 var strlen = text.length; 1622 for (var i = 0; i < strlen; i++) { 1623 var charCode = text.charCodeAt(i); 1624 var cc = null; 1625 1626 // Adjustments 1627 if (charCode == 8216 || charCode == 8217) { 1628 // Curly quotes to straight 1629 var c = "'"; 1630 } 1631 else { 1632 var c = text.charAt(i); 1633 } 1634 1635 // Consider single quote in the middle of a word a letter 1636 if (c == "'" && word !== '') { 1637 cc = kWbClassAlphaLetter; 1638 } 1639 1640 if (!cc) { 1641 cc = getClass(c, charCode); 1642 } 1643 1644 // When we reach space or punctuation, store the previous word if there is one 1645 if (cc == kWbClassSpace || cc == kWbClassPunct) { 1646 if (word != '') { 1647 words[word] = true; 1648 word = ''; 1649 } 1650 // When we reach Han character, store previous word and add Han character 1651 } else if (cc == kWbClassHanLetter) { 1652 if (word !== '') { 1653 words[word] = true; 1654 word = ''; 1655 } 1656 words[c] = true; 1657 // Otherwise, if character class hasn't changed, keep adding characters to previous word 1658 } else if (cc == cclass) { 1659 word += c.toLowerCase(); 1660 // If character class is different, store previous word and start new word 1661 } else { 1662 if (word !== '') { 1663 words[word] = true; 1664 } 1665 word = c.toLowerCase(); 1666 } 1667 cclass = cc; 1668 } 1669 if (word !== '') { 1670 words[word] = true; 1671 } 1672 1673 return Object.keys(words).map(function (w) { 1674 // Trim trailing single quotes 1675 if (w.slice(-1) == "'") { 1676 w = w.substr(0, w.length - 1); 1677 } 1678 return w; 1679 }); 1680 } 1681 1682 function _getScriptExtension() { 1683 return Zotero.isWin ? 'vbs' : 'sh'; 1684 } 1685 1686 }