fulltext.js - www - Unnamed repository; edit this file 'description' to name the repository.

fulltext.js (48434B)
      1 /*
      2     ***** BEGIN LICENSE BLOCK *****
      3     
      4     Copyright © 2009 Center for History and New Media
      5                      George Mason University, Fairfax, Virginia, USA
      6                      http://zotero.org
      7     
      8     This file is part of Zotero.
      9     
     10     Zotero is free software: you can redistribute it and/or modify
     11     it under the terms of the GNU Affero General Public License as published by
     12     the Free Software Foundation, either version 3 of the License, or
     13     (at your option) any later version.
     14     
     15     Zotero is distributed in the hope that it will be useful,
     16     but WITHOUT ANY WARRANTY; without even the implied warranty of
     17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     18     GNU Affero General Public License for more details.
     19     
     20     You should have received a copy of the GNU Affero General Public License
     21     along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
     22     
     23     ***** END LICENSE BLOCK *****
     24 */
     25 
     26 Zotero.Fulltext = Zotero.FullText = new function(){
	// Expose the isCachedMIMEType() helper (declared later in this scope) on the public object
	this.isCachedMIMEType = isCachedMIMEType;
	
	// File names of the per-attachment cache files; indexPDF() joins them onto the
	// attachment's directory and has pdftotext/pdfinfo write their output there
	this.__defineGetter__("pdfConverterCacheFile", function () { return '.zotero-ft-cache'; });
	this.__defineGetter__("pdfInfoCacheFile", function () { return '.zotero-ft-info'; });
	
	// Index state constants (their consumers are outside this portion of the file)
	this.INDEX_STATE_UNAVAILABLE = 0;
	this.INDEX_STATE_UNINDEXED = 1;
	this.INDEX_STATE_PARTIAL = 2;
	this.INDEX_STATE_INDEXED = 3;
	this.INDEX_STATE_QUEUED = 4;
	
	// Values stored in the fulltextItems.synced column
	this.SYNC_STATE_UNSYNCED = 0;
	this.SYNC_STATE_IN_SYNC = 1;
	this.SYNC_STATE_TO_PROCESS = 2;
	this.SYNC_STATE_TO_DOWNLOAD = 3;
	this.SYNC_STATE_MISSING = 4;
	
	// NOTE(review): presumably the file name used by getItemProcessorCacheFile(), which
	// is defined outside this portion of the file -- confirm
	const _processorCacheFile = '.zotero-ft-unprocessed';
	
	// Character classes returned by getClass(); names follow Mozilla's word breaker
	const kWbClassSpace =            0;
	const kWbClassAlphaLetter =      1;
	const kWbClassPunct =            2;
	const kWbClassHanLetter =        3;
	const kWbClassKatakanaLetter =   4;
	const kWbClassHiraganaLetter =   5;
	const kWbClassHWKatakanaLetter = 6;
	const kWbClassThaiLetter =       7;
	
	// Set in init() and overridable via setPDFConverterPath()/setPDFInfoPath()/setPDFDataPath()
	var _pdfConverter = null; // nsIFile to executable
	var _pdfInfo = null; // nsIFile to executable
	var _pdfData = null; // Path string of the 'poppler-data' directory, passed as -datadir
	
	// Whether this.idleObserver is currently registered with the idle service
	var _idleObserverIsRegistered = false;
	// Idle time passed to addIdleObserver()/removeIdleObserver() -- presumably seconds; confirm
	var _idleObserverDelay = 30;
	// Pending content-processor timer, cleared by stopContentProcessor()
	var _processorTimeoutID = null;
	// The remaining state is used outside this portion of the file
	var _processorBlacklist = {};
	var _upgradeCheck = true;
	var _syncLibraryVersion = 0;
     65 	
     66 	this.init = Zotero.Promise.coroutine(function* () {
     67 		yield Zotero.DB.queryAsync("ATTACH ':memory:' AS 'indexing'");
     68 		yield Zotero.DB.queryAsync('CREATE TABLE indexing.fulltextWords (word NOT NULL)');
     69 		
     70 		this.decoder = Components.classes["@mozilla.org/intl/utf8converterservice;1"].
     71 			getService(Components.interfaces.nsIUTF8ConverterService);
     72 		
     73 		let pdfConverterFileName = "pdftotext";
     74 		let pdfInfoFileName = "pdfinfo";
     75 		
     76 		if (Zotero.isWin) {
     77 			pdfConverterFileName += '.exe';
     78 			pdfInfoFileName += '.exe';
     79 		}
     80 		
     81 		let dir = FileUtils.getDir('AChrom', []).parent;
     82 		
     83 		_pdfData = dir.clone();
     84 		_pdfData.append('poppler-data');
     85 		_pdfData = _pdfData.path;
     86 		
     87 		_pdfConverter = dir.clone();
     88 		_pdfInfo = dir.clone();
     89 		
     90 		if(Zotero.isMac) {
     91 			_pdfConverter = _pdfConverter.parent;
     92 			_pdfConverter.append('MacOS');
     93 			
     94 			_pdfInfo = _pdfInfo.parent;
     95 			_pdfInfo.append('MacOS');
     96 		}
     97 
     98 		_pdfConverter.append(pdfConverterFileName);
     99 		_pdfInfo.append(pdfInfoFileName);
    100 		
    101 		Zotero.uiReadyPromise.delay(30000).then(() => {
    102 			this.registerContentProcessor();
    103 			Zotero.addShutdownListener(this.unregisterContentProcessor.bind(this));
    104 			
    105 			// Start/stop content processor with full-text content syncing pref
    106 			Zotero.Prefs.registerObserver('sync.fulltext.enabled', (enabled) => {
    107 				if (enabled) {
    108 					this.registerContentProcessor();
    109 				}
    110 				else {
    111 					this.unregisterContentProcessor();
    112 				}
    113 			});
    114 			
    115 			// Stop content processor during syncs
    116 			Zotero.Notifier.registerObserver(
    117 				{
    118 					notify: Zotero.Promise.method(function (event, type, ids, extraData) {
    119 						if (event == 'start') {
    120 							this.unregisterContentProcessor();
    121 						}
    122 						else if (event == 'stop') {
    123 							this.registerContentProcessor();
    124 						}
    125 					}.bind(this))
    126 				},
    127 				['sync'],
    128 				'fulltext'
    129 			);
    130 		});
    131 	});
    132 	
    133 	
    134 	this.setPDFConverterPath = function(path) {
    135 		_pdfConverter = Zotero.File.pathToFile(path);
    136 	};
    137 	
    138 	
    139 	this.setPDFInfoPath = function(path) {
    140 		_pdfInfo = Zotero.File.pathToFile(path);
    141 		
    142 	};
    143 	
    144 	
    145 	this.setPDFDataPath = function(path) {
    146 		_pdfData = path;
    147 	};
    148 	
    149 	
    150 	this.getLibraryVersion = function (libraryID) {
    151 		if (!libraryID) throw new Error("libraryID not provided");
    152 		return Zotero.DB.valueQueryAsync(
    153 			"SELECT version FROM version WHERE schema=?", "fulltext_" + libraryID
    154 		)
    155 	};
    156 	
    157 	
    158 	this.setLibraryVersion = Zotero.Promise.coroutine(function* (libraryID, version) {
    159 		if (!libraryID) throw new Error("libraryID not provided");
    160 		yield Zotero.DB.queryAsync(
    161 			"REPLACE INTO version VALUES (?, ?)", ["fulltext_" + libraryID, version]
    162 		);
    163 	});
    164 	
    165 	
    166 	this.clearLibraryVersion = function (libraryID) {
    167 		return Zotero.DB.queryAsync("DELETE FROM version WHERE schema=?", "fulltext_" + libraryID);
    168 	};
    169 	
    170 	
    171 	this.getItemVersion = Zotero.Promise.coroutine(function* (itemID) {
    172 		return Zotero.DB.valueQueryAsync(
    173 			"SELECT version FROM fulltextItems WHERE itemID=?", itemID
    174 		)
    175 	});
    176 	
    177 	
    178 	this.setItemSynced = function (itemID, version) {
    179 		return Zotero.DB.queryAsync(
    180 			"UPDATE fulltextItems SET synced=?, version=? WHERE itemID=?",
    181 			[this.SYNC_STATE_IN_SYNC, version, itemID]
    182 		);
    183 	};
    184 	
    185 	
    186 	// this is a port from http://mxr.mozilla.org/mozilla-central/source/intl/lwbrk/src/nsSampleWordBreaker.cpp to
    187 	// Javascript to avoid the overhead of xpcom calls. The port keeps to the mozilla naming of interfaces/constants as
    188 	// closely as possible.
    189 	function getClass(c, cc) {
    190 		if (cc < 0x2E80) { //alphabetical script
    191 			if ((cc & 0xFF80) == 0) { // ascii
    192 				if (c == ' '  || c == "\t" || c == "\r" || c == "\n") { return kWbClassSpace; }
    193 				if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; }
    194 				return kWbClassPunct;
    195 			}
    196 			if ((0xFF80 & cc) == 0x0E00) { return kWbClassThaiLetter; }
    197 			if (cc == 0x00A0/*NBSP*/) { return kWbClassSpace; }
    198 			
    199 			// General and Supplemental Unicode punctuation
    200 			if ((cc >= 0x2000 && cc <= 0x206f) || (cc >= 0x2e00 && cc <= 0x2e7f)) { return kWbClassPunct; }
    201 			
    202 			return kWbClassAlphaLetter;
    203 		}
    204 
    205 		if ((cc >= 0x3400 && cc <= 0x9fff) || (cc>= 0xf900 && cc <= 0xfaff)) /*han*/ { return kWbClassHanLetter; }
    206 		if (cc >= 0x30A0 && cc <= 0x30FF) { return kWbClassKatakanaLetter; }
    207 		if (cc >= 0x3040 && cc <= 0x309F) { return kWbClassHiraganaLetter; }
    208 		if (cc>= 0xFF60 && cc <= 0xFF9F) { return kWbClassHWKatakanaLetter; }
    209 		return kWbClassAlphaLetter;
    210 	}
    211 	
    212 	
    213 	this.getPDFConverterExecAndArgs = function () {
    214 		return {
    215 			exec: _pdfConverter,
    216 			args: ['-datadir', _pdfData]
    217 		}
    218 	};
    219 	
    220 	
    221 	/*
    222 	 * Returns true if MIME type is converted to text and cached before indexing
    223 	 *   (e.g. application/pdf is run through pdftotext)
    224 	 */
    225 	function isCachedMIMEType(mimeType) {
    226 		switch (mimeType) {
    227 			case 'application/pdf':
    228 				return true;
    229 		}
    230 		return false;
    231 	}
    232 	
    233 	
    234 	/**
    235 	 * Index multiple words at once
    236 	 *
    237 	 * @requireTransaction
    238 	 * @param {Number} itemID
    239 	 * @param {Array<string>} words
    240 	 * @return {Promise}
    241 	 */
    242 	var indexWords = Zotero.Promise.coroutine(function* (itemID, words) {
    243 		Zotero.DB.requireTransaction();
    244 		let chunk;
    245 		yield Zotero.DB.queryAsync("DELETE FROM indexing.fulltextWords");
    246 		while (words.length > 0) {
    247 			chunk = words.splice(0, 100);
    248 			yield Zotero.DB.queryAsync('INSERT INTO indexing.fulltextWords (word) ' + chunk.map(x => 'SELECT ?').join(' UNION '), chunk);
    249 		}
    250 		yield Zotero.DB.queryAsync('INSERT OR IGNORE INTO fulltextWords (word) SELECT word FROM indexing.fulltextWords');
    251 		yield Zotero.DB.queryAsync('DELETE FROM fulltextItemWords WHERE itemID = ?', [itemID]);
    252 		yield Zotero.DB.queryAsync('INSERT OR IGNORE INTO fulltextItemWords (wordID, itemID) SELECT wordID, ? FROM fulltextWords JOIN indexing.fulltextWords USING(word)', [itemID]);
    253 		yield Zotero.DB.queryAsync("REPLACE INTO fulltextItems (itemID, version) VALUES (?,?)", [itemID, 0]);
    254 		yield Zotero.DB.queryAsync("DELETE FROM indexing.fulltextWords");
    255 	});
    256 	
    257 	
    258 	/**
    259 	 * @return {Promise}
    260 	 */
    261 	var indexString = Zotero.Promise.coroutine(function* (text, charset, itemID, stats, version, synced) {
    262 		var words = this.semanticSplitter(text, charset);
    263 		
    264 		while (Zotero.DB.inTransaction()) {
    265 			yield Zotero.DB.waitForTransaction('indexString()');
    266 		}
    267 		
    268 		yield Zotero.DB.executeTransaction(function* () {
    269 			this.clearItemWords(itemID, true);
    270 			yield indexWords(itemID, words, stats, version, synced);
    271 			
    272 			var sql = "UPDATE fulltextItems SET synced=?";
    273 			var params = [synced ? parseInt(synced) : this.SYNC_STATE_UNSYNCED];
    274 			if (stats) {
    275 				for (let stat in stats) {
    276 					sql += ", " + stat + "=?";
    277 					params.push(stats[stat] ? parseInt(stats[stat]) : null);
    278 				}
    279 			}
    280 			if (version) {
    281 				sql += ", version=?";
    282 				params.push(parseInt(version));
    283 			}
    284 			sql += " WHERE itemID=?";
    285 			params.push(itemID);
    286 			yield Zotero.DB.queryAsync(sql, params);
    287 			
    288 			/*
    289 			var sql = "REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)";
    290 			Zotero.DB.query(sql, [itemID, {string:text}]);
    291 			*/
    292 			
    293 			Zotero.Notifier.queue('refresh', 'item', itemID);
    294 		}.bind(this));
    295 		
    296 		// If there's a processor cache file, delete it (whether or not we just used it)
    297 		var item = yield Zotero.Items.getAsync(itemID);
    298 		var cacheFile = this.getItemProcessorCacheFile(item);
    299 		if (cacheFile.exists()) {
    300 			cacheFile.remove(false);
    301 		}
    302 	}.bind(this));
    303 	
    304 	
    305 	/**
    306 	 * @param {Document} document
    307 	 * @param {Number} itemID
    308 	 * @return {Promise}
    309 	 */
    310 	this.indexDocument = Zotero.Promise.coroutine(function* (document, itemID) {
    311 		if (!itemID){
    312 			throw ('Item ID not provided to indexDocument()');
    313 		}
    314 		
    315 		Zotero.debug("Indexing document '" + document.title + "'");
    316 		
    317 		if (!Zotero.MIME.isTextType(document.contentType)) {
    318 			Zotero.debug(document.contentType + " document is not text", 2);
    319 			return false;
    320 		}
    321 		
    322 		if (!document.body) {
    323 			Zotero.debug("Cannot index " + document.contentType + " file", 2);
    324 			return false;
    325 		}
    326 		
    327 		if (!document.characterSet){
    328 			Zotero.debug("Text file didn't have charset", 2);
    329 			return false;
    330 		}
    331 		
    332 		var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
    333 		if (!maxLength) {
    334 			return false;
    335 		}
    336 		var obj = yield convertItemHTMLToText(itemID, document.body.innerHTML, maxLength);
    337 		var text = obj.text;
    338 		var totalChars = obj.totalChars;
    339 		
    340 		if (totalChars > maxLength) {
    341 			Zotero.debug('Only indexing first ' + maxLength + ' characters of item '
    342 				+ itemID + ' in indexDocument()');
    343 		}
    344 		
    345 		yield indexString(text, document.characterSet, itemID);
    346 		yield setChars(itemID, { indexed: text.length, total: totalChars });
    347 	});
    348 	
    349 	
    350 	/**
    351 	 * @param {String} path
    352 	 * @param {Boolean} [complete=FALSE]  Index the file in its entirety, ignoring maxLength
    353 	 */
    354 	var indexFile = Zotero.Promise.coroutine(function* (path, contentType, charset, itemID, complete, isCacheFile) {
    355 		if (!(yield OS.File.exists(path))) {
    356 			Zotero.debug('File not found in indexFile()', 2);
    357 			return false;
    358 		}
    359 		
    360 		if (!contentType) {
    361 			Zotero.debug("Content type not provided in indexFile()", 1);
    362 			return false;
    363 		}
    364 		
    365 		if (!itemID) {
    366 			throw new Error('Item ID not provided');
    367 		}
    368 		
    369 		if (contentType == 'application/pdf') {
    370 			return this.indexPDF(path, itemID, complete);
    371 		}
    372 		
    373 		if (!Zotero.MIME.isTextType(contentType)) {
    374 			Zotero.debug('File is not text in indexFile()', 2);
    375 			return false;
    376 		}
    377 		
    378 		if (!charset) {
    379 			Zotero.logError(`Item ${itemID} didn't have a charset`);
    380 			return false;
    381 		}
    382 		
    383 		var maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
    384 		if (!maxLength) {
    385 			return false;
    386 		}
    387 		if (complete) {
    388 			maxLength = null;
    389 		}
    390 		
    391 		Zotero.debug('Indexing file ' + path);
    392 		var text = yield Zotero.File.getContentsAsync(path, charset);
    393 		var totalChars = text.length;
    394 		if (contentType == 'text/html') {
    395 			let obj = yield convertItemHTMLToText(itemID, text, maxLength);
    396 			text = obj.text;
    397 			totalChars = obj.totalChars;
    398 		}
    399 		else {
    400 			if (maxLength && text.length > maxLength) {
    401 				text = text.substr(0, maxLength);
    402 			}
    403 		}
    404 		
    405 		yield indexString(text, charset, itemID);
    406 		
    407 		// Record the number of characters indexed (unless we're indexing a (PDF) cache file,
    408 		// in which case the stats are coming from elsewhere)
    409 		if (!isCacheFile) {
    410 			yield setChars(itemID, { indexed: text.length, total: totalChars });
    411 		}
    412 		
    413 		return true;
    414 	}.bind(this));
    415 	
    416 	
    417 	/**
    418 	 * Run PDF through pdfinfo and pdftotext to generate .zotero-ft-info
    419 	 * and .zotero-ft-cache, and pass the text file back to indexFile()
    420 	 *
    421 	 * @param {nsIFile} file
    422 	 * @param {Number} itemID
    423 	 * @param {Boolean} [allPages] - If true, index all pages rather than pdfMaxPages
    424 	 * @return {Promise}
    425 	 */
    426 	this.indexPDF = Zotero.Promise.coroutine(function* (filePath, itemID, allPages) {
    427 		var maxPages = Zotero.Prefs.get('fulltext.pdfMaxPages');
    428 		if (maxPages == 0) {
    429 			return false;
    430 		}
    431 		
    432 		var item = yield Zotero.Items.getAsync(itemID);
    433 		var linkMode = item.attachmentLinkMode;
    434 		// If file is stored outside of Zotero, create a directory for the item
    435 		// in the storage directory and save the cache file there
    436 		if (linkMode == Zotero.Attachments.LINK_MODE_LINKED_FILE) {
    437 			var parentDirPath = yield Zotero.Attachments.createDirectoryForItem(item);
    438 		}
    439 		else {
    440 			var parentDirPath = OS.Path.dirname(filePath);
    441 		}
    442 		var infoFilePath = OS.Path.join(parentDirPath, this.pdfInfoCacheFile);
    443 		var cacheFilePath = OS.Path.join(parentDirPath, this.pdfConverterCacheFile);
    444 		
    445 
    446 		var args = [filePath, infoFilePath];
    447 
    448 		try {
    449 			yield Zotero.Utilities.Internal.exec(_pdfInfo, args);
    450 			var totalPages = yield getTotalPagesFromFile(itemID);
    451 		}
    452 		catch (e) {
    453 			Zotero.debug("Error running " + _pdfInfo.path, 1);
    454 			Zotero.logError(e);
    455 		}
    456 
    457 		
    458 		var {exec, args} = this.getPDFConverterExecAndArgs();
    459 		args.push('-nopgbrk');
    460 		
    461 		if (allPages) {
    462 			if (totalPages) {
    463 				var pagesIndexed = totalPages;
    464 			}
    465 		}
    466 		else {
    467 			args.push('-l', maxPages);
    468 			var pagesIndexed = Math.min(maxPages, totalPages);
    469 		}
    470 		args.push(filePath, cacheFilePath);
    471 		
    472 		try {
    473 			yield Zotero.Utilities.Internal.exec(exec, args);
    474 		}
    475 		catch (e) {
    476 			Zotero.debug("Error running " + exec.path, 1);
    477 			Zotero.logError(e);
    478 			return false;
    479 		}
    480 		
    481 		if (!(yield OS.File.exists(cacheFilePath))) {
    482 			let fileName = OS.Path.basename(filePath);
    483 			let msg = fileName + " was not indexed";
    484 			if (!fileName.match(/^[\u0000-\u007F]+$/)) {
    485 				msg += " -- PDFs with filenames containing extended characters cannot currently be indexed due to a Mozilla limitation";
    486 			}
    487 			Zotero.debug(msg, 2);
    488 			Components.utils.reportError(msg);
    489 			return false;
    490 		}
    491 		
    492 		yield indexFile(cacheFilePath, 'text/plain', 'utf-8', itemID, true, true);
    493 		yield setPages(itemID, { indexed: pagesIndexed, total: totalPages });
    494 		
    495 		return true;
    496 	});
    497 	
    498 	
    499 	/**
    500 	 * @param {Integer[]|Integer} items - One or more itemIDs
    501 	 */
    502 	this.indexItems = Zotero.Promise.coroutine(function* (items, complete, ignoreErrors) {
    503 		if (!Array.isArray(items)) {
    504 			items = [items];
    505 		}
    506 		var items = yield Zotero.Items.getAsync(items);
    507 		var found = [];
    508 		
    509 		for (let i=0; i<items.length; i++) {
    510 			let item = items[i];
    511 			if (!item.isAttachment()) {
    512 				continue;
    513 			}
    514 			
    515 			Zotero.debug("Indexing item " + item.libraryKey);
    516 			let itemID = item.id;
    517 			
    518 			var path = yield item.getFilePathAsync();
    519 			if (!path) {
    520 				if (yield OS.File.exists(this.getItemProcessorCacheFile(item).path)) {
    521 					yield Zotero.Fulltext.indexFromProcessorCache(itemID);
    522 				}
    523 				else {
    524 					Zotero.debug("No file to index for item " + item.libraryKey
    525 						+ " in Zotero.FullText.indexItems()");
    526 				}
    527 				continue;
    528 			}
    529 			
    530 			try {
    531 				yield indexFile(path, item.attachmentContentType, item.attachmentCharset, itemID, complete);
    532 			}
    533 			catch (e) {
    534 				if (ignoreErrors) {
    535 					Components.utils.reportError("Error indexing " + path);
    536 					Zotero.logError(e);
    537 				}
    538 				else {
    539 					throw e;
    540 				}
    541 			}
    542 		}
    543 	});
    544 	
    545 	
    546 	// TEMP: Temporary mechanism to serialize indexing of new attachments
    547 	//
    548 	// This should instead save the itemID to a table that's read by the content processor
    549 	var _queue = [];
    550 	var _indexing = false;
    551 	var _nextIndexTime;
    552 	var _indexDelay = 5000;
    553 	var _indexInterval = 500;
    554 	this.queueItem = function (item) {
    555 		// Don't index files in the background during tests
    556 		if (Zotero.test) return;
    557 		
    558 		_queue.push(item.id);
    559 		_nextIndexTime = Date.now() + _indexDelay;
    560 		setTimeout(() => {
    561 			_processNextItem()
    562 		}, _indexDelay);
    563 	};
    564 	
    565 	async function _processNextItem() {
    566 		if (!_queue.length) return;
    567 		// Another _processNextItem() was scheduled
    568 		if (Date.now() < _nextIndexTime) return;
    569 		// If indexing is already running, _processNextItem() will be called when it's done
    570 		if (_indexing) return;
    571 		_indexing = true;
    572 		var itemID = _queue.shift();
    573 		try {
    574 			await Zotero.Fulltext.indexItems([itemID], false, true);
    575 		}
    576 		finally {
    577 			_indexing = false;
    578 		}
    579 		setTimeout(() => {
    580 			_processNextItem();
    581 		}, _indexInterval);
    582 	};
    583 	
    584 	
    585 	//
    586 	// Full-text content syncing
    587 	//
    588 	/**
    589 	 * Get content and stats that haven't yet been synced
    590 	 *
    591 	 * @param {Integer} libraryID
    592 	 * @param {Integer} [options]
    593 	 * @param {Integer} [options.maxSize]
    594 	 * @param {Integer} [options.maxItems]
    595 	 * @param {Integer} [options.lastItemID] - Only return content for items above this id
    596 	 * @return {Promise<Array<Object>>}
    597 	 */
    598 	this.getUnsyncedContent = Zotero.Promise.coroutine(function* (libraryID, options = {}) {
    599 		var contentItems = [];
    600 		var sql = "SELECT itemID, indexedChars, totalChars, indexedPages, totalPages "
    601 			+ "FROM fulltextItems FI JOIN items I USING (itemID) WHERE libraryID=? AND "
    602 			+ "FI.synced=? AND I.synced=1 ";
    603 		var params = [libraryID, this.SYNC_STATE_UNSYNCED];
    604 		if (options.lastItemID) {
    605 			sql += "AND itemID>?";
    606 			params.push(options.lastItemID);
    607 		}
    608 		sql += "ORDER BY itemID";
    609 		var rows = yield Zotero.DB.queryAsync(sql, params);
    610 		var contentSize = 0;
    611 		for (let i = 0; i < rows.length; i++) {
    612 			let row = rows[i];
    613 			let content;
    614 			let itemID = row.itemID;
    615 			let item = yield Zotero.Items.getAsync(itemID);
    616 			let libraryKey = item.libraryKey;
    617 			let contentType = item.attachmentContentType;
    618 			if (contentType && (isCachedMIMEType(contentType) || Zotero.MIME.isTextType(contentType))) {
    619 				try {
    620 					let cacheFile = this.getItemCacheFile(item).path;
    621 					if (yield OS.File.exists(cacheFile)) {
    622 						Zotero.debug("Getting full-text content from cache "
    623 							+ "file for item " + libraryKey);
    624 						content = yield Zotero.File.getContentsAsync(cacheFile);
    625 					}
    626 					else {
    627 						// If there should be a cache file and isn't, mark the full text as missing
    628 						if (!Zotero.MIME.isTextType(contentType)) {
    629 							Zotero.debug("Full-text content cache file doesn't exist for item "
    630 								+ libraryKey, 2);
    631 							let sql = "UPDATE fulltextItems SET synced=? WHERE itemID=?";
    632 							yield Zotero.DB.queryAsync(sql, [this.SYNC_STATE_MISSING, item.id]);
    633 							continue;
    634 						}
    635 						
    636 						// Same for missing attachments
    637 						let path = yield item.getFilePathAsync();
    638 						if (!path) {
    639 							Zotero.debug("File doesn't exist getting full-text content for item "
    640 								+ libraryKey, 2);
    641 							let sql = "UPDATE fulltextItems SET synced=? WHERE itemID=?";
    642 							yield Zotero.DB.queryAsync(sql, [this.SYNC_STATE_MISSING, item.id]);
    643 							continue;
    644 						}
    645 						
    646 						Zotero.debug("Getting full-text content from file for item " + libraryKey);
    647 						content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset);
    648 						
    649 						// If HTML, convert to plain text first, and cache the result
    650 						if (item.attachmentContentType == 'text/html') {
    651 							let obj = yield convertItemHTMLToText(
    652 								itemID,
    653 								content,
    654 								// Include in the cache file only as many characters as we
    655 								// indexed previously
    656 								row.indexedChars
    657 							);
    658 							content = obj.text;
    659 						}
    660 						else {
    661 							// Include only as many characters as we've indexed
    662 							content = content.substr(0, row.indexedChars);
    663 						}
    664 					}
    665 				}
    666 				catch (e) {
    667 					Zotero.logError(e);
    668 					continue;
    669 				}
    670 			}
    671 			else {
    672 				Zotero.debug("Skipping non-text file getting full-text content for item "
    673 					+ `${libraryKey} (contentType: ${contentType})`, 2);
    674 				
    675 				// Delete rows for items that weren't supposed to be indexed
    676 				yield Zotero.DB.executeTransaction(function* () {
    677 					yield this.clearItemWords(itemID);
    678 				}.bind(this));
    679 				continue;
    680 			}
    681 			
    682 			// If this isn't the first item and it would put us over the size limit, stop
    683 			if (contentItems.length && options.maxSize && contentSize + content.length > options.maxSize) {
    684 				break;
    685 			}
    686 			
    687 			contentItems.push({
    688 				itemID: item.id,
    689 				key: item.key,
    690 				content,
    691 				indexedChars: row.indexedChars ? row.indexedChars : 0,
    692 				totalChars: row.totalChars ? row.totalChars : 0,
    693 				indexedPages: row.indexedPages ? row.indexedPages : 0,
    694 				totalPages: row.totalPages ? row.totalPages : 0
    695 			});
    696 			
    697 			if (options.maxItems && contentItems.length >= options.maxItems) {
    698 				break;
    699 			}
    700 			contentSize += content.length;
    701 		}
    702 		return contentItems;
    703 	});
    704 	
    705 	
    706 	/**
    707 	 * @return {String}  PHP-formatted POST data for items not yet downloaded
    708 	 */
    709 	this.getUndownloadedPostData = Zotero.Promise.coroutine(function* () {
    710 		// TODO: Redo for API syncing
    711 		
    712 		// On upgrade, get all content
    713 		var sql = "SELECT value FROM settings WHERE setting='fulltext' AND key='downloadAll'";
    714 		if (yield Zotero.DB.valueQueryAsync(sql)) {
    715 			return "&ftkeys=all";
    716 		}
    717 		
    718 		var sql = "SELECT itemID FROM fulltextItems WHERE synced=" + this.SYNC_STATE_TO_DOWNLOAD;
    719 		var itemIDs = yield Zotero.DB.columnQueryAsync(sql);
    720 		if (!itemIDs) {
    721 			return "";
    722 		}
    723 		var undownloaded = {};
    724 		for (let i=0; i<itemIDs.length; i++) {
    725 			let itemID = itemIDs[i];
    726 			let item = yield Zotero.Items.getAsync(itemID);
    727 			let libraryID = item.libraryID
    728 			if (!undownloaded[libraryID]) {
    729 				undownloaded[libraryID] = [];
    730 			}
    731 			undownloaded[libraryID].push(item.key);
    732 		}
    733 		var data = "";
    734 		for (let libraryID in undownloaded) {
    735 			for (let i = 0; i < undownloaded[libraryID].length; i++) {
    736 				data += "&" + encodeURIComponent("ftkeys[" + libraryID + "][" + i + "]")
    737 					+ "=" + undownloaded[libraryID][i];
    738 			}
    739 		}
    740 		return data;
    741 	});
    742 	
    743 	
    744 	/**
    745 	 * Save full-text content and stats to a cache file
    746 	 *
    747 	 * @param {Integer} libraryID
    748 	 * @param {String} key - Item key
    749 	 * @param {Object} data
    750 	 * @param {String} data.content
    751 	 * @param {Integer} [data.indexedChars]
    752 	 * @param {Integer} [data.totalChars]
    753 	 * @param {Integer} [data.indexedPages]
    754 	 * @param {Integer} [data.totalPages]
    755 	 * @param {Integer} version
    756 	 * @return {Promise}
    757 	 */
    758 	this.setItemContent = Zotero.Promise.coroutine(function* (libraryID, key, data, version) {
    759 		var libraryKey = libraryID + "/" + key;
    760 		var item = Zotero.Items.getByLibraryAndKey(libraryID, key);
    761 		if (!item) {
    762 			let msg = "Item " + libraryKey + " not found setting full-text content";
    763 			Zotero.logError(msg);
    764 			return;
    765 		}
    766 		var itemID = item.id;
    767 		var currentVersion = yield this.getItemVersion(itemID)
    768 		
    769 		var processorCacheFile = this.getItemProcessorCacheFile(item).path; // .zotero-ft-unprocessed
    770 		var itemCacheFile = this.getItemCacheFile(item).path; // .zotero-ft-cache
    771 		
    772 		// If a storage directory doesn't exist, create it
    773 		if (!(yield OS.File.exists(OS.Path.dirname(processorCacheFile)))) {
    774 			yield Zotero.Attachments.createDirectoryForItem(item);
    775 		}
    776 		
    777 		// If indexed previously and the existing extracted text matches the new text,
    778 		// just update the version
    779 		if (currentVersion !== false
    780 				&& (yield OS.File.exists(itemCacheFile))
    781 				&& (yield Zotero.File.getContentsAsync(itemCacheFile)) == data.content) {
    782 			Zotero.debug("Current full-text content matches remote for item "
    783 				+ libraryKey + " -- updating version");
    784 			return Zotero.DB.queryAsync(
    785 				"REPLACE INTO fulltextItems (itemID, version, synced) VALUES (?, ?, ?)",
    786 				[itemID, version, this.SYNC_STATE_IN_SYNC]
    787 			);
    788 		}
    789 		
    790 		// Otherwise save data to -unprocessed file
    791 		Zotero.debug("Writing full-text content and data for item " + libraryKey
    792 			+ " to " + processorCacheFile);
    793 		yield Zotero.File.putContentsAsync(processorCacheFile, JSON.stringify({
    794 			indexedChars: data.indexedChars,
    795 			totalChars: data.totalChars,
    796 			indexedPages: data.indexedPages,
    797 			totalPages: data.totalPages,
    798 			version,
    799 			text: data.content
    800 		}));
    801 		var synced = this.SYNC_STATE_TO_PROCESS;
    802 		
    803 		// If indexed previously, update the sync state
    804 		if (currentVersion !== false) {
    805 			yield Zotero.DB.queryAsync("UPDATE fulltextItems SET synced=? WHERE itemID=?", [synced, itemID]);
    806 		}
    807 		// If not yet indexed, add an empty row
    808 		else {
    809 			yield Zotero.DB.queryAsync(
    810 				"REPLACE INTO fulltextItems (itemID, version, synced) VALUES (?, 0, ?)",
    811 				[itemID, synced]
    812 			);
    813 		}
    814 		
    815 		this.registerContentProcessor();
    816 	});
    817 	
    818 	
    819 	/**
    820 	 * Start the idle observer for the background content processor
    821 	 */
    822 	this.registerContentProcessor = function () {
    823 		// Don't start idle observer during tests
    824 		if (Zotero.test) return;
    825 		if (!Zotero.Prefs.get('sync.fulltext.enabled')) return;
    826 		
    827 		if (!_idleObserverIsRegistered) {
    828 			Zotero.debug("Starting full-text content processor");
    829 			var idleService = Components.classes["@mozilla.org/widget/idleservice;1"]
    830 					.getService(Components.interfaces.nsIIdleService);
    831 			idleService.addIdleObserver(this.idleObserver, _idleObserverDelay);
    832 			_idleObserverIsRegistered = true;
    833 		}
    834 	}
    835 	
    836 	
    837 	this.unregisterContentProcessor = function () {
    838 		if (_idleObserverIsRegistered) {
    839 			Zotero.debug("Unregistering full-text content processor idle observer");
    840 			var idleService = Components.classes["@mozilla.org/widget/idleservice;1"]
    841 				.getService(Components.interfaces.nsIIdleService);
    842 			idleService.removeIdleObserver(this.idleObserver, _idleObserverDelay);
    843 			_idleObserverIsRegistered = false;
    844 		}
    845 		
    846 		this.stopContentProcessor();
    847 	}
    848 	
    849 	
    850 	/**
    851 	 * Stop the idle observer and a running timer, if there is one
    852 	 */
    853 	this.stopContentProcessor = function () {
    854 		Zotero.debug("Stopping full-text content processor");
    855 		if (_processorTimeoutID) {
    856 			clearTimeout(_processorTimeoutID);
    857 			_processorTimeoutID = null;
    858 		}
    859 	}
    860 	
    861 	/**
    862 	 * Find items marked as having unprocessed cache files, run cache file processing on one item, and
    863 	 * after a short delay call self again with the remaining items
    864 	 *
    865 	 * @param {Array<Integer>} itemIDs  An array of itemIDs to process; if this
    866 	 *                                  is omitted, a database query is made
    867 	 *                                  to find unprocessed content
    868 	 * @return {Boolean}  TRUE if there's more content to process; FALSE otherwise
    869 	 */
    870 	this.processUnprocessedContent = Zotero.Promise.coroutine(function* (itemIDs) {
    871 		// Idle observer can take a little while to trigger and may not cancel the setTimeout()
    872 		// in time, so check idle time directly
    873 		var idleService = Components.classes["@mozilla.org/widget/idleservice;1"]
    874 			.getService(Components.interfaces.nsIIdleService);
    875 		if (idleService.idleTime < _idleObserverDelay * 1000) {
    876 			return;
    877 		}
    878 		
    879 		if (!itemIDs) {
    880 			Zotero.debug("Checking for unprocessed full-text content");
    881 			let sql = "SELECT itemID FROM fulltextItems WHERE synced=" + this.SYNC_STATE_TO_PROCESS;
    882 			itemIDs = yield Zotero.DB.columnQueryAsync(sql);
    883 		}
    884 		
    885 		var origLen = itemIDs.length;
    886 		itemIDs = itemIDs.filter(function (id) {
    887 			return !(id in _processorBlacklist);
    888 		});
    889 		if (itemIDs.length < origLen) {
    890 			let skipped = (origLen - itemIDs.length);
    891 			Zotero.debug("Skipping large full-text content for " + skipped
    892 				+ " item" + (skipped == 1 ? '' : 's'));
    893 		}
    894 		
    895 		// If there's no more unprocessed content, stop the idle observer
    896 		if (!itemIDs.length) {
    897 			Zotero.debug("No unprocessed full-text content found");
    898 			this.unregisterContentProcessor();
    899 			return;
    900 		}
    901 		
    902 		let itemID = itemIDs.shift();
    903 		let item = yield Zotero.Items.getAsync(itemID);
    904 		
    905 		Zotero.debug("Processing full-text content for item " + item.libraryKey);
    906 		
    907 		yield Zotero.Fulltext.indexFromProcessorCache(itemID);
    908 		
    909 		if (!itemIDs.length || idleService.idleTime < _idleObserverDelay * 1000) {
    910 			return;
    911 		}
    912 		
    913 		// If there are remaining items, call self again after a short delay. The delay allows
    914 		// for processing to be interrupted if the user returns from idle. At least on macOS,
    915 		// when Zotero is in the background this can be throttled to 10 seconds.
    916 		_processorTimeoutID = setTimeout(() => this.processUnprocessedContent(itemIDs), 200);
    917 	});
    918 	
    919 	this.idleObserver = {
    920 		observe: function (subject, topic, data) {
    921 			// On idle, start the background processor
    922 			if (topic == 'idle') {
    923 				this.processUnprocessedContent();
    924 			}
    925 			// When back from idle, stop the processor (but keep the idle observer registered)
    926 			else if (topic == 'active') {
    927 				this.stopContentProcessor();
    928 			}
    929 		}.bind(this)
    930 	};
    931 	
    932 	
    933 	/**
    934 	 * @param {Number} itemID
    935 	 * @return {Promise<Boolean>}
    936 	 */
    937 	this.indexFromProcessorCache = Zotero.Promise.coroutine(function* (itemID) {
    938 		try {
    939 			var item = yield Zotero.Items.getAsync(itemID);
    940 			var cacheFile = this.getItemProcessorCacheFile(item).path;
    941 			if (!(yield OS.File.exists(cacheFile)))  {
    942 				Zotero.debug("Full-text content processor cache file doesn't exist for item " + itemID);
    943 				yield Zotero.DB.queryAsync(
    944 					"UPDATE fulltextItems SET synced=? WHERE itemID=?",
    945 					[this.SYNC_STATE_UNSYNCED, itemID]
    946 				);
    947 				return false;
    948 			}
    949 			
    950 			var json = yield Zotero.File.getContentsAsync(cacheFile);
    951 			var data = JSON.parse(json);
    952 			
    953 			// Write the text content to the regular cache file
    954 			var item = yield Zotero.Items.getAsync(itemID);
    955 			cacheFile = this.getItemCacheFile(item).path;
    956 			Zotero.debug("Writing full-text content to " + cacheFile);
    957 			yield Zotero.File.putContentsAsync(cacheFile, data.text);
    958 			
    959 			yield indexString(
    960 				data.text,
    961 				"UTF-8",
    962 				itemID,
    963 				{
    964 					indexedChars: data.indexedChars,
    965 					totalChars: data.totalChars,
    966 					indexedPages: data.indexedPages,
    967 					totalPages: data.totalPages
    968 				},
    969 				data.version,
    970 				1
    971 			);
    972 			
    973 			return true;
    974 		}
    975 		catch (e) {
    976 			Components.utils.reportError(e);
    977 			Zotero.debug(e, 1);
    978 			return false;
    979 		};
    980 	});
    981 	
    982 	//
    983 	// End full-text content syncing
    984 	//
    985 	
    986 	
    987 	/*
    988 	 * Scan a string for another string
    989 	 *
    990 	 * _items_ -- one or more attachment items to search
    991 	 * _searchText_ -- text pattern to search for
    992 	 * _mode_:
    993 	 *    'regexp' -- regular expression (case-insensitive)
    994 	 *    'regexpCS' -- regular expression (case-sensitive)
    995 	 *
    996 	 * - Slashes in regex are optional
    997 	 */
    998 	function findTextInString(content, searchText, mode) {
    999 		switch (mode){
   1000 			case 'regexp':
   1001 			case 'regexpCS':
   1002 			case 'regexpBinary':
   1003 			case 'regexpCSBinary':
   1004 				// Do a multiline search by default
   1005 				var flags = 'm';
   1006 				var parts = searchText.match(/^\/(.*)\/([^\/]*)/);
   1007 				if (parts){
   1008 					searchText = parts[1];
   1009 					// Ignore user-supplied flags
   1010 					//flags = parts[2];
   1011 				}
   1012 				
   1013 				if (mode.indexOf('regexpCS')==-1){
   1014 					flags += 'i';
   1015 				}
   1016 				
   1017 				try {
   1018 					var re = new RegExp(searchText, flags);
   1019 					var matches = re.exec(content);
   1020 				}
   1021 				catch (e) {
   1022 					Zotero.debug(e, 1);
   1023 					Components.utils.reportError(e);
   1024 				}
   1025 				if (matches){
   1026 					Zotero.debug("Text found");
   1027 					return content.substr(matches.index, 50);
   1028 				}
   1029 				
   1030 				break;
   1031 			
   1032 			default:
   1033 				// Case-insensitive
   1034 				searchText = searchText.toLowerCase();
   1035 				content = content.toLowerCase();
   1036 				
   1037 				var pos = content.indexOf(searchText);
   1038 				if (pos!=-1){
   1039 					Zotero.debug('Text found');
   1040 					return content.substr(pos, 50);
   1041 				}
   1042 		}
   1043 		
   1044 		return -1;
   1045 	}
   1046 	
   1047 	/**
   1048 	 * Scan item files for a text string
   1049 	 *
   1050 	 * _items_ -- one or more attachment items to search
   1051 	 * _searchText_ -- text pattern to search for
   1052 	 * _mode_:
   1053 	 *    'phrase'
   1054 	 *    'regexp'
   1055 	 *    'regexpCS' -- case-sensitive regular expression
   1056 	 *
   1057 	 * Note:
   1058 	 *  - Slashes in regex are optional
   1059 	 *  - Add 'Binary' to the mode to search all files, not just text files
   1060 	 *
   1061 	 * @return {Promise<Array<Object>>} A promise for an array of match objects, with 'id' containing
   1062 	 *                                  an itemID and 'match' containing a string snippet
   1063 	 */
   1064 	this.findTextInItems = Zotero.Promise.coroutine(function* (items, searchText, mode){
   1065 		if (!searchText){
   1066 			return [];
   1067 		}
   1068 		
   1069 		var items = yield Zotero.Items.getAsync(items);
   1070 		var found = [];
   1071 		
   1072 		for (let i=0; i<items.length; i++) {
   1073 			let item = items[i];
   1074 			if (!item.isAttachment()) {
   1075 				continue;
   1076 			}
   1077 			
   1078 			let itemID = item.id;
   1079 			let content;
   1080 			let mimeType = item.attachmentContentType;
   1081 			let maxLength = Zotero.Prefs.get('fulltext.textMaxLength');
   1082 			let binaryMode = mode && mode.indexOf('Binary') != -1;
   1083 			
   1084 			if (isCachedMIMEType(mimeType)) {
   1085 				let file = this.getItemCacheFile(item).path;
   1086 				if (!(yield OS.File.exists(file))) {
   1087 					continue;
   1088 				}
   1089 				
   1090 				Zotero.debug("Searching for text '" + searchText + "' in " + file);
   1091 				content = yield Zotero.File.getContentsAsync(file, 'utf-8', maxLength);
   1092 			}
   1093 			else {
   1094 				// If not binary mode, only scan plaintext files
   1095 				if (!binaryMode) {
   1096 					if (!Zotero.MIME.isTextType(mimeType)) {
   1097 						Zotero.debug('Not scanning MIME type ' + mimeType, 4);
   1098 						continue;
   1099 					}
   1100 				}
   1101 				
   1102 				// Check for a cache file
   1103 				let cacheFile = this.getItemCacheFile(item).path;
   1104 				if (yield OS.File.exists(cacheFile)) {
   1105 					Zotero.debug("Searching for text '" + searchText + "' in " + cacheFile);
   1106 					content = yield Zotero.File.getContentsAsync(cacheFile, 'utf-8', maxLength);
   1107 				}
   1108 				else {
   1109 					// If that doesn't exist, check for the actual file
   1110 					let path = yield item.getFilePathAsync();
   1111 					if (!path) {
   1112 						continue;
   1113 					}
   1114 					
   1115 					Zotero.debug("Searching for text '" + searchText + "' in " + path);
   1116 					content = yield Zotero.File.getContentsAsync(path, item.attachmentCharset);
   1117 					
   1118 					// If HTML and not binary mode, convert to text
   1119 					if (mimeType == 'text/html' && !binaryMode) {
   1120 						// Include in the cache file only as many characters as we've indexed
   1121 						let chars = yield getChars(itemID);
   1122 						
   1123 						let obj = yield convertItemHTMLToText(
   1124 							itemID, content, chars ? chars.indexedChars : null
   1125 						);
   1126 						content = obj.text;
   1127 					}
   1128 				}
   1129 			}
   1130 			
   1131 			let match = findTextInString(content, searchText, mode);
   1132 			if (match != -1) {
   1133 				found.push({
   1134 					id: itemID,
   1135 					match: match
   1136 				});
   1137 			}
   1138 		}
   1139 		
   1140 		return found;
   1141 	});
   1142 	
   1143 	
   1144 	/**
   1145 	 * @requireTransaction
   1146 	 */
   1147 	this.clearItemWords = Zotero.Promise.coroutine(function* (itemID, skipCacheClear) {
   1148 		Zotero.DB.requireTransaction();
   1149 		
   1150 		var sql = "SELECT rowid FROM fulltextItems WHERE itemID=? LIMIT 1";
   1151 		var indexed = yield Zotero.DB.valueQueryAsync(sql, itemID);
   1152 		if (indexed) {
   1153 			yield Zotero.DB.queryAsync("DELETE FROM fulltextItemWords WHERE itemID=?", itemID);
   1154 			yield Zotero.DB.queryAsync("DELETE FROM fulltextItems WHERE itemID=?", itemID);
   1155 		}
   1156 		
   1157 		if (indexed) {
   1158 			Zotero.Prefs.set('purge.fulltext', true);
   1159 		}
   1160 		
   1161 		if (!skipCacheClear) {
   1162 			// Delete fulltext cache file if there is one
   1163 			yield clearCacheFile(itemID);
   1164 		}
   1165 	});
   1166 	
   1167 	
   1168 	/**
   1169 	 * @return {Promise}
   1170 	 */
   1171 	this.getPages = function (itemID, force) {
   1172 		var sql = "SELECT indexedPages, totalPages AS total "
   1173 			+ "FROM fulltextItems WHERE itemID=?";
   1174 		return Zotero.DB.rowQueryAsync(sql, itemID);
   1175 	}
   1176 	
   1177 	
   1178 	/**
   1179 	 * Gets the number of pages from the PDF info cache file
   1180 	 *
   1181 	 * @private
   1182 	 * @return {Promise}
   1183 	 */
   1184 	var getTotalPagesFromFile = Zotero.Promise.coroutine(function* (itemID) {
   1185 		var file = OS.Path.join(
   1186 			Zotero.Attachments.getStorageDirectoryByID(itemID).path,
   1187 			Zotero.Fulltext.pdfInfoCacheFile
   1188 		);
   1189 		if (!(yield OS.File.exists(file))) {
   1190 			return false;
   1191 		}
   1192 		var contents = yield Zotero.File.getContentsAsync(file);
   1193 		try {
   1194 			// Parse pdfinfo output
   1195 			var pages = contents.match('Pages:[^0-9]+([0-9]+)')[1];
   1196 		}
   1197 		catch (e) {
   1198 			Zotero.debug(e);
   1199 			return false;
   1200 		}
   1201 		return pages;
   1202 	});
   1203 	
   1204 	
   1205 	/**
   1206 	 * @return {Promise}
   1207 	 */
   1208 	function getChars(itemID) {
   1209 		var sql = "SELECT indexedChars, totalChars AS total "
   1210 			+ "FROM fulltextItems WHERE itemID=?";
   1211 		return Zotero.DB.rowQueryAsync(sql, itemID);
   1212 	}
   1213 	
   1214 	
   1215 	/**
   1216 	 * Gets the number of characters from the PDF converter cache file
   1217 	 *
   1218 	 * @return {Promise}
   1219 	 */
   1220 	var getTotalCharsFromFile = Zotero.Promise.coroutine(function* (itemID) {
   1221 		var item = yield Zotero.Items.getAsync(itemID);
   1222 		switch (item.attachmentContentType) {
   1223 			case 'application/pdf':
   1224 				var file = OS.Path.join(
   1225 					Zotero.Attachments.getStorageDirectory(item).path,
   1226 					this.pdfConverterCacheFile
   1227 				);
   1228 				if (!(yield OS.File.exists(file))) {
   1229 					return false;
   1230 				}
   1231 				break;
   1232 				
   1233 			default:
   1234 				var file = yield item.getFilePathAsync();
   1235 				if (!file) {
   1236 					return false;
   1237 				}
   1238 		}
   1239 		
   1240 		var contents = yield Zotero.File.getContentsAsync(file);
   1241 		return contents.length;
   1242 	});
   1243 	
   1244 	
   1245 	/**
   1246 	 * @return {Promise}
   1247 	 */
   1248 	function setPages(itemID, obj) {
   1249 		var sql = "UPDATE fulltextItems SET indexedPages=?, totalPages=? WHERE itemID=?";
   1250 		return Zotero.DB.queryAsync(
   1251 			sql,
   1252 			[
   1253 				obj.indexed ? parseInt(obj.indexed) : null,
   1254 				obj.total ? parseInt(obj.total) : null,
   1255 				itemID
   1256 			]
   1257 		);
   1258 	}
   1259 	
   1260 	
   1261 	/**
   1262 	 * @param {Number} itemID
   1263 	 * @param {Object} obj
   1264 	 * @return {Promise}
   1265 	 */
   1266 	function setChars(itemID, obj) {
   1267 		var sql = "UPDATE fulltextItems SET indexedChars=?, totalChars=? WHERE itemID=?";
   1268 		return Zotero.DB.queryAsync(
   1269 			sql,
   1270 			[
   1271 				obj.indexed ? parseInt(obj.indexed) : null,
   1272 				obj.total ? parseInt(obj.total) : null,
   1273 				itemID
   1274 			]
   1275 		);
   1276 	}
   1277 	
   1278 	
   1279 	/*
   1280 	 * Gets the indexed state of an item, 
   1281 	 */
   1282 	this.getIndexedState = Zotero.Promise.coroutine(function* (item) {
   1283 		if (!item.isAttachment()) {
   1284 			throw new Error('Item is not an attachment');
   1285 		}
   1286 		
   1287 		// If the file or cache file wasn't available during syncing, mark as unindexed
   1288 		var synced = yield Zotero.DB.valueQueryAsync(
   1289 			"SELECT synced FROM fulltextItems WHERE itemID=?", item.id
   1290 		);
   1291 		if (synced === false || synced == this.SYNC_STATE_MISSING) {
   1292 			return this.INDEX_STATE_UNINDEXED;
   1293 		}
   1294 		
   1295 		var itemID = item.id;
   1296 		var state = this.INDEX_STATE_UNINDEXED;
   1297 		switch (item.attachmentContentType) {
   1298 			// Use pages for PDFs
   1299 			case 'application/pdf':
   1300 				var o = yield this.getPages(itemID);
   1301 				if (o) {
   1302 					var stats = {
   1303 						indexed: o.indexedPages,
   1304 						total: o.total
   1305 					};
   1306 				}
   1307 				break;
   1308 			
   1309 			default:
   1310 				var o = yield getChars(itemID);
   1311 				if (o) {
   1312 					var stats = {
   1313 						indexed: o.indexedChars,
   1314 						total: o.total
   1315 					};
   1316 				}
   1317 		}
   1318 		
   1319 		if (stats) {
   1320 			if (!stats.total && !stats.indexed) {
   1321 				let queued = false;
   1322 				try {
   1323 					queued = yield OS.File.exists(this.getItemProcessorCacheFile(item).path);
   1324 				}
   1325 				catch (e) {
   1326 					Zotero.logError(e);
   1327 				}
   1328 				state = queued ? this.INDEX_STATE_QUEUED : this.INDEX_STATE_UNAVAILABLE;
   1329 			}
   1330 			else if (!stats.indexed) {
   1331 				state = this.INDEX_STATE_UNINDEXED;
   1332 			}
   1333 			else if (stats.indexed < stats.total) {
   1334 				state = this.INDEX_STATE_PARTIAL;
   1335 			}
   1336 			else {
   1337 				state = this.INDEX_STATE_INDEXED;
   1338 			}
   1339 		}
   1340 		return state;
   1341 	});
   1342 	
   1343 	
   1344 	this.isFullyIndexed = Zotero.Promise.coroutine(function* (item) {
   1345 		return (yield this.getIndexedState(item)) == this.INDEX_STATE_INDEXED;
   1346 	});
   1347 	
   1348 	
   1349 	/**
   1350 	 * @return {Promise}
   1351 	 */
   1352 	this.getIndexStats = Zotero.Promise.coroutine(function* () {
   1353 		var sql = "SELECT COUNT(*) FROM fulltextItems WHERE synced != ? AND "
   1354 			+ "((indexedPages IS NOT NULL AND indexedPages=totalPages) OR "
   1355 			+ "(indexedChars IS NOT NULL AND indexedChars=totalChars))"
   1356 		var indexed = yield Zotero.DB.valueQueryAsync(sql, this.SYNC_STATE_MISSING);
   1357 		
   1358 		var sql = "SELECT COUNT(*) FROM fulltextItems WHERE "
   1359 			+ "(indexedPages IS NOT NULL AND indexedPages<totalPages) OR "
   1360 			+ "(indexedChars IS NOT NULL AND indexedChars<totalChars)"
   1361 		var partial = yield Zotero.DB.valueQueryAsync(sql);
   1362 		
   1363 		var sql = "SELECT COUNT(*) FROM itemAttachments WHERE itemID NOT IN "
   1364 			+ "(SELECT itemID FROM fulltextItems WHERE synced != ? AND "
   1365 			+ "(indexedPages IS NOT NULL OR indexedChars IS NOT NULL))";
   1366 		var unindexed = yield Zotero.DB.valueQueryAsync(sql, this.SYNC_STATE_MISSING);
   1367 		
   1368 		var sql = "SELECT COUNT(*) FROM fulltextWords";
   1369 		var words = yield Zotero.DB.valueQueryAsync(sql);
   1370 		
   1371 		return { indexed, partial, unindexed, words };
   1372 	});
   1373 	
   1374 	
   1375 	this.getItemCacheFile = function (item) {
   1376 		var cacheFile = Zotero.Attachments.getStorageDirectory(item);
   1377 		cacheFile.append(this.pdfConverterCacheFile);
   1378 		return cacheFile;
   1379 	}
   1380 	
   1381 	
   1382 	this.getItemProcessorCacheFile = function (item) {
   1383 		var cacheFile = Zotero.Attachments.getStorageDirectory(item);
   1384 		cacheFile.append(_processorCacheFile);
   1385 		return cacheFile;
   1386 	}
   1387 	
   1388 	
   1389 	/*
   1390 	 * Returns true if an item can be reindexed
   1391 	 *
   1392 	 * Item must be a non-web-link attachment that isn't already fully indexed
   1393 	 */
   1394 	this.canReindex = Zotero.Promise.coroutine(function* (item) {
   1395 		if (item.isAttachment()
   1396 				&& item.attachmentLinkMode != Zotero.Attachments.LINK_MODE_LINKED_URL) {
   1397 			let contentType = item.attachmentContentType;
   1398 			if (!contentType || contentType != 'application/pdf' && !Zotero.MIME.isTextType(contentType)) {
   1399 				return false;
   1400 			}
   1401 			switch (yield this.getIndexedState(item)) {
   1402 				case this.INDEX_STATE_UNAVAILABLE:
   1403 				case this.INDEX_STATE_UNINDEXED:
   1404 				case this.INDEX_STATE_PARTIAL:
   1405 				case this.INDEX_STATE_QUEUED:
   1406 				
   1407 				// TODO: automatically reindex already-indexed attachments?
   1408 				case this.INDEX_STATE_INDEXED:
   1409 					return true;
   1410 			}
   1411 		}
   1412 		return false;
   1413 	});
   1414 	
   1415 	
   1416 	/**
   1417 	 * @return {Promise}
   1418 	 */
   1419 	this.rebuildIndex = Zotero.Promise.coroutine(function* (unindexedOnly) {
   1420 		// Get all attachments other than web links
   1421 		var sql = "SELECT itemID FROM itemAttachments WHERE linkMode!="
   1422 			+ Zotero.Attachments.LINK_MODE_LINKED_URL;
   1423 		var params = [];
   1424 		if (unindexedOnly) {
   1425 			sql += " AND itemID NOT IN (SELECT itemID FROM fulltextItems "
   1426 				+ "WHERE synced != ? AND (indexedChars IS NOT NULL OR indexedPages IS NOT NULL))";
   1427 			params.push(this.SYNC_STATE_MISSING);
   1428 		}
   1429 		var items = yield Zotero.DB.columnQueryAsync(sql, params);
   1430 		if (items) {
   1431 			yield Zotero.DB.executeTransaction(function* () {
   1432 				yield Zotero.DB.queryAsync(
   1433 					"DELETE FROM fulltextItemWords WHERE itemID IN (" + sql + ")", params
   1434 				);
   1435 				yield Zotero.DB.queryAsync(
   1436 					"DELETE FROM fulltextItems WHERE itemID IN (" + sql + ")", params
   1437 				);
   1438 			});
   1439 			
   1440 			yield this.indexItems(items, false, true);
   1441 		}
   1442 	});
   1443 	
   1444 	
   1445 	/**
   1446 	 * Clears full-text word index and all full-text cache files
   1447 	 *
   1448 	 * @return {Promise}
   1449 	 */
   1450 	this.clearIndex = function (skipLinkedURLs) {
   1451 		return Zotero.DB.executeTransaction(function* () {
   1452 			var sql = "DELETE FROM fulltextItems";
   1453 			if (skipLinkedURLs) {
   1454 				var linkSQL = "SELECT itemID FROM itemAttachments WHERE linkMode ="
   1455 					+ Zotero.Attachments.LINK_MODE_LINKED_URL;
   1456 				
   1457 				sql += " WHERE itemID NOT IN (" + linkSQL + ")";
   1458 			}
   1459 			yield Zotero.DB.queryAsync(sql);
   1460 			
   1461 			sql = "DELETE FROM fulltextItemWords";
   1462 			if (skipLinkedURLs) {
   1463 				sql += " WHERE itemID NOT IN (" + linkSQL + ")";
   1464 			}
   1465 			yield Zotero.DB.queryAsync(sql);
   1466 			
   1467 			if (skipLinkedURLs) {
   1468 				yield this.purgeUnusedWords();
   1469 			}
   1470 			else {
   1471 				yield Zotero.DB.queryAsync("DELETE FROM fulltextWords");
   1472 			}
   1473 			
   1474 			yield clearCacheFiles();
   1475 		}.bind(this));
   1476 	}
   1477 	
   1478 	
   1479 	/*
   1480 	 * Clears cache file for an item
   1481 	 */
   1482 	var clearCacheFile = Zotero.Promise.coroutine(function* (itemID) {
   1483 		var item = yield Zotero.Items.getAsync(itemID);
   1484 		if (!item) {
   1485 			return;
   1486 		}
   1487 		
   1488 		if (!item.isAttachment()) {
   1489 			Zotero.debug("Item " + itemID + " is not an attachment in Zotero.Fulltext.clearCacheFile()");
   1490 			return;
   1491 		}
   1492 		
   1493 		Zotero.debug('Clearing full-text cache file for item ' + itemID);
   1494 		var cacheFile = Zotero.Fulltext.getItemCacheFile(item);
   1495 		if (cacheFile.exists()) {
   1496 			try {
   1497 				cacheFile.remove(false);
   1498 			}
   1499 			catch (e) {
   1500 				Zotero.File.checkFileAccessError(e, cacheFile, 'delete');
   1501 			}
   1502 		}
   1503 	});
   1504 	
   1505 	
   1506 	/*
   1507 	 * Clear cache files for all attachments
   1508 	 */
   1509 	var clearCacheFiles = Zotero.Promise.coroutine(function* (skipLinkedURLs) {
   1510 		var sql = "SELECT itemID FROM itemAttachments";
   1511 		if (skipLinkedURLs) {
   1512 			sql += " WHERE linkMode != " + Zotero.Attachments.LINK_MODE_LINKED_URL;
   1513 		}
   1514 		var items = yield Zotero.DB.columnQueryAsync(sql);
   1515 		for (var i=0; i<items.length; i++) {
   1516 			yield clearCacheFile(items[i]);
   1517 		}
   1518 	});
   1519 	
   1520 	
   1521 	/*
   1522 	function clearItemContent(itemID){
   1523 		Zotero.DB.query("DELETE FROM fulltextContent WHERE itemID=" + itemID);
   1524 	}
   1525 	*/
   1526 	
   1527 	
   1528 	/**
   1529 	 * @return {Promise}
   1530 	 */
   1531 	this.purgeUnusedWords = Zotero.Promise.coroutine(function* () {
   1532 		if (!Zotero.Prefs.get('purge.fulltext')) {
   1533 			return;
   1534 		}
   1535 		
   1536 		var sql = "DELETE FROM fulltextWords WHERE wordID NOT IN "
   1537 					+ "(SELECT wordID FROM fulltextItemWords)";
   1538 		yield Zotero.DB.queryAsync(sql);
   1539 		
   1540 		Zotero.Prefs.set('purge.fulltext', false)
   1541 	});
   1542 	
   1543 	
   1544 	/**
   1545 	 * Convert HTML to text for an item and cache the result
   1546 	 *
   1547 	 * @return {Promise}
   1548 	 */
   1549 	var convertItemHTMLToText = Zotero.Promise.coroutine(function* (itemID, html, maxLength) {
   1550 		// Split elements to avoid word concatentation
   1551 		html = html.replace(/>/g, '> ');
   1552 		
   1553 		var text = HTMLToText(html);
   1554 		var totalChars = text.length;
   1555 		
   1556 		if (maxLength) {
   1557 			text = text.substr(0, maxLength);
   1558 		}
   1559 		
   1560 		// Write the converted text to a cache file
   1561 		var item = yield Zotero.Items.getAsync(itemID);
   1562 		var cacheFile = Zotero.Fulltext.getItemCacheFile(item).path;
   1563 		Zotero.debug("Writing converted full-text HTML content to " + cacheFile);
   1564 		if (!(yield OS.File.exists(OS.Path.dirname(cacheFile)))) {
   1565 			yield Zotero.Attachments.createDirectoryForItem(item);
   1566 		}
   1567 		yield Zotero.File.putContentsAsync(cacheFile, text)
   1568 		.catch(function (e) {
   1569 			Zotero.debug(e, 1);
   1570 			Components.utils.reportError(e);
   1571 		});
   1572 		
   1573 		return {
   1574 			text: text,
   1575 			totalChars: totalChars
   1576 		};
   1577 	});
   1578 	
   1579 	function HTMLToText(html) {
   1580 		var	nsIFC = Components.classes['@mozilla.org/widget/htmlformatconverter;1']
   1581 			.createInstance(Components.interfaces.nsIFormatConverter);
   1582 		var from = Components.classes['@mozilla.org/supports-string;1']
   1583 			.createInstance(Components.interfaces.nsISupportsString);
   1584 		from.data = html;
   1585 		var to = { value: null };
   1586 		try {
   1587 			nsIFC.convert('text/html', from, from.toString().length, 'text/unicode', to, {});
   1588 			to = to.value.QueryInterface(Components.interfaces.nsISupportsString);
   1589 			return to.toString();
   1590 		}
   1591 		catch(e) {
   1592 			Zotero.debug(e, 1);
   1593 			return html;
   1594 		}
   1595 	}
   1596 	
   1597 	
   1598 	/**
   1599 	 * @param {String} text
   1600 	 * @param {String} [charset]
   1601 	 * @return {Array<String>}
   1602 	 */
   1603 	this.semanticSplitter = function (text, charset) {
   1604 		if (!text){
   1605 			Zotero.debug('No text to index');
   1606 			return [];
   1607 		}
   1608 		
   1609 		try {
   1610 			if (charset && charset != 'utf-8') {
   1611 				text = this.decoder.convertStringToUTF8(text, charset, true);
   1612 			}
   1613 		} catch (err) {
   1614 			Zotero.debug("Error converting from charset " + charset, 1);
   1615 			Zotero.debug(err, 1);
   1616 		}
   1617 		
   1618 		var words = {};
   1619 		var word = '';
   1620 		var cclass = null;
   1621 		var strlen = text.length;
   1622 		for (var i = 0; i < strlen; i++) {
   1623 			var charCode = text.charCodeAt(i);
   1624 			var cc = null;
   1625 			
   1626 			// Adjustments
   1627 			if (charCode == 8216 || charCode == 8217) {
   1628 				// Curly quotes to straight
   1629 				var c = "'";
   1630 			}
   1631 			else {
   1632 				var c = text.charAt(i);
   1633 			}
   1634 			
   1635 			// Consider single quote in the middle of a word a letter
   1636 			if (c == "'" && word !== '') {
   1637 				cc = kWbClassAlphaLetter;
   1638 			}
   1639 			
   1640 			if (!cc) {
   1641 				cc = getClass(c, charCode);
   1642 			}
   1643 			
   1644 			// When we reach space or punctuation, store the previous word if there is one
   1645 			if (cc == kWbClassSpace || cc == kWbClassPunct) {
   1646 				if (word != '') {
   1647 					words[word] = true;
   1648 					word = '';
   1649 				}
   1650 			// When we reach Han character, store previous word and add Han character
   1651 			} else if (cc == kWbClassHanLetter) {
   1652 				if (word !== '') {
   1653 					words[word] = true;
   1654 					word = '';
   1655 				}
   1656 				words[c] = true;
   1657 			// Otherwise, if character class hasn't changed, keep adding characters to previous word
   1658 			} else if (cc == cclass) {
   1659 				word += c.toLowerCase();
   1660 			// If character class is different, store previous word and start new word
   1661 			} else {
   1662 				if (word !== '') {
   1663 					words[word] = true;
   1664 				}
   1665 				word = c.toLowerCase();
   1666 			}
   1667 			cclass = cc;
   1668 		}
   1669 		if (word !== '') {
   1670 			words[word] = true;
   1671 		}
   1672 		
   1673 		return Object.keys(words).map(function (w) {
   1674 			// Trim trailing single quotes
   1675 			if (w.slice(-1) == "'") {
   1676 				w = w.substr(0, w.length - 1);
   1677 			}
   1678 			return w;
   1679 		});
   1680 	}
   1681 	
   1682 	function _getScriptExtension() {
   1683 		return Zotero.isWin ? 'vbs' : 'sh';
   1684 	}
   1685 
   1686 }
	www Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| Submodules \| README \| LICENSE