www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 9161237e9033ddcaea67f63f1a6058136e2183df
parent 0271d042e833406aab4734dd8a34295b832b40ff
Author: Dan Stillman <dstillman@zotero.org>
Date:   Tue,  5 Nov 2013 16:21:11 -0500

Merge branch '4.0_fulltext_sync' into 4.0

Diffstat:
Mchrome/content/zotero/preferences/preferences_sync.xul | 9+++++++++
Mchrome/content/zotero/xpcom/attachments.js | 6++----
Mchrome/content/zotero/xpcom/db.js | 4++--
Mchrome/content/zotero/xpcom/fulltext.js | 686+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------
Mchrome/content/zotero/xpcom/schema.js | 11+++++++++--
Mchrome/content/zotero/xpcom/sync.js | 112++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
Mchrome/content/zotero/xpcom/zotero.js | 15+++++++++++++--
Mchrome/locale/en-US/zotero/preferences.dtd | 2++
Mchrome/locale/en-US/zotero/zotero.properties | 6++++++
Mdefaults/preferences/zotero.js | 1+
Mresource/schema/userdata.sql | 1+
11 files changed, 720 insertions(+), 133 deletions(-)

diff --git a/chrome/content/zotero/preferences/preferences_sync.xul b/chrome/content/zotero/preferences/preferences_sync.xul @@ -32,6 +32,7 @@ <preferences> <preference id="pref-sync-autosync" name="extensions.zotero.sync.autoSync" type="bool"/> <preference id="pref-sync-username" name="extensions.zotero.sync.server.username" type="string" instantApply="true"/> + <preference id="pref-sync-fulltext-enabled" name="extensions.zotero.sync.fulltext.enabled" type="bool"/> <preference id="pref-storage-enabled" name="extensions.zotero.sync.storage.enabled" type="bool"/> <preference id="pref-storage-protocol" name="extensions.zotero.sync.storage.protocol" type="string" onchange="Zotero_Preferences.Sync.unverifyStorageServer()"/> @@ -76,6 +77,14 @@ <box/> <checkbox label="&zotero.preferences.sync.syncAutomatically;" preference="pref-sync-autosync"/> </row> + <row> + <box/> + <vbox> + <checkbox label="&zotero.preferences.sync.syncFullTextContent;" + preference="pref-sync-fulltext-enabled" + tooltiptext="&zotero.preferences.sync.syncFullTextContent.desc;"/> + </vbox> + </row> <!-- <row> <box/> diff --git a/chrome/content/zotero/xpcom/attachments.js b/chrome/content/zotero/xpcom/attachments.js @@ -567,11 +567,9 @@ Zotero.Attachments = new function(){ var f = function() { if (mimeType == 'application/pdf') { Zotero.Fulltext.indexPDF(file, itemID); - Zotero.Notifier.trigger('refresh', 'item', itemID); } - if (Zotero.MIME.isTextType(mimeType)) { + else if (Zotero.MIME.isTextType(mimeType)) { Zotero.Fulltext.indexDocument(document, itemID); - Zotero.Notifier.trigger('refresh', 'item', itemID); } if (callback) { callback(attachmentItem); @@ -981,7 +979,7 @@ Zotero.Attachments = new function(){ function getStorageDirectory(itemID) { if (!itemID) { - throw ("itemID not provided in Zotero.Attachments.getStorageDirectory()"); + throw new Error("itemID not provided in Zotero.Attachments.getStorageDirectory()"); } var item = Zotero.Items.get(itemID); if (!item) { diff --git a/chrome/content/zotero/xpcom/db.js b/chrome/content/zotero/xpcom/db.js @@ -804,7 +804,7 @@ Zotero.DBConnection.prototype.closeDatabase = function () { } -Zotero.DBConnection.prototype.backupDatabase = function (suffix) { +Zotero.DBConnection.prototype.backupDatabase = function (suffix, force) { if (!suffix) { var numBackups = Zotero.Prefs.get("backup.numBackups"); if (numBackups < 1) { @@ -839,7 +839,7 @@ Zotero.DBConnection.prototype.backupDatabase = function (suffix) { var file = Zotero.getZoteroDatabase(this._dbName); // For standard backup, make sure last backup is old enough to replace - if (!suffix) { + if (!suffix && !force) { var backupFile = Zotero.getZoteroDatabase(this._dbName, 'bak'); if (backupFile.exists()) { var currentDBTime = file.lastModifiedTime; diff --git a/chrome/content/zotero/xpcom/fulltext.js b/chrome/content/zotero/xpcom/fulltext.js @@ -24,7 +24,6 @@ */ Zotero.Fulltext = new function(){ - const FULLTEXT_VERSION = 1; const CACHE_FILE = '.zotero-ft-cache'; this.init = init; @@ -38,7 +37,6 @@ Zotero.Fulltext = new function(){ this.indexFile = indexFile; this.indexPDF = indexPDF; this.indexItems = indexItems; - this.findTextInFile = findTextInFile; this.findTextInItems = findTextInItems; this.clearItemWords = clearItemWords; this.getPages = getPages; @@ -56,7 +54,6 @@ Zotero.Fulltext = new function(){ this.clearCacheFiles = clearCacheFiles; //this.clearItemContent = clearItemContent; this.purgeUnusedWords = purgeUnusedWords; - this.HTMLToText = HTMLToText; this.semanticSplitter = semanticSplitter; this.__defineGetter__("pdfToolsDownloadBaseURL", function() { return 'http://www.zotero.org/download/xpdf/'; }); @@ -72,6 +69,7 @@ Zotero.Fulltext = new function(){ this.__defineGetter__("INDEX_STATE_PARTIAL", function () { return 2; }); this.__defineGetter__("INDEX_STATE_INDEXED", function () { return 3; }); + const _processorCacheFile = '.zotero-ft-unprocessed'; var _pdfConverterVersion = null; var _pdfConverterFileName = null; @@ -80,6 +78,16 @@ Zotero.Fulltext = new function(){ var _pdfInfoFileName = null; var _pdfInfo = null; // nsIFile to executable + var _idleObserverIsRegistered = false; + var _idleObserverDelay = 5; + var _processorTimer = null; + var _upgradeCheck = true; + + const SYNC_STATE_UNSYNCED = 0; + const SYNC_STATE_IN_SYNC = 1; + const SYNC_STATE_TO_PROCESS = 2; + const SYNC_STATE_TO_DOWNLOAD = 3; + var self = this; function init() { @@ -98,6 +106,18 @@ Zotero.Fulltext = new function(){ this.registerPDFTool('converter'); this.registerPDFTool('info'); + + // TEMP: Remove after 4.1 DB schema change + var cols = Zotero.DB.getColumns('fulltextItems'); + if (cols.indexOf("synced") == -1) { + Zotero.DB.beginTransaction(); + Zotero.DB.query("ALTER TABLE fulltextItems ADD COLUMN synced INT DEFAULT 0"); + Zotero.DB.query("REPLACE INTO settings (setting, key, value) VALUES ('fulltext', 'downloadAll', 1)"); + Zotero.DB.commitTransaction(); + } + + this.startContentProcessor(); + Zotero.addShutdownListener(this.stopContentProcessor); } @@ -192,7 +212,7 @@ Zotero.Fulltext = new function(){ /* * Index multiple words at once */ - function indexWords(itemID, words){ + function indexWords(itemID, words) { if (!words || !words.length || !itemID){ return false; } @@ -231,9 +251,10 @@ Zotero.Fulltext = new function(){ } while (done < numWords); - Zotero.DB.query("REPLACE INTO fulltextItems (itemID, version) VALUES (?,?)", - [itemID, FULLTEXT_VERSION]); - + if (!Zotero.DB.valueQuery("SELECT COUNT(*) FROM fulltextItems WHERE itemID=?", itemID)) { + let sql = "INSERT INTO fulltextItems (itemID, version) VALUES (?,?)"; + Zotero.DB.query(sql, [itemID, 0]); + } // Handle bound parameters manually for optimal speed var statement1 = Zotero.DB.getStatement("INSERT INTO fulltextWords (word) VALUES (?)"); @@ -266,7 +287,7 @@ Zotero.Fulltext = new function(){ } - function indexString(text, charset, itemID){ + function indexString(text, charset, itemID, stats, version, synced) { try { Zotero.UnresponsiveScriptIndicator.disable(); @@ -275,7 +296,23 @@ Zotero.Fulltext = new function(){ Zotero.DB.beginTransaction(); this.clearItemWords(itemID, true); - this.indexWords(itemID, words); + this.indexWords(itemID, words, stats, version, synced); + + var sql = "UPDATE fulltextItems SET synced=?"; + var params = [synced ? parseInt(synced) : SYNC_STATE_UNSYNCED]; + if (stats) { + for (let stat in stats) { + sql += ", " + stat + "=?"; + params.push(stats[stat] ? parseInt(stats[stat]) : null); + } + } + if (version) { + sql += ", version=?"; + params.push(parseInt(version)); + } + sql += " WHERE itemID=?"; + params.push(itemID); + Zotero.DB.query(sql, params); /* var sql = "REPLACE INTO fulltextContent (itemID, textContent) VALUES (?,?)"; @@ -283,6 +320,14 @@ Zotero.Fulltext = new function(){ */ Zotero.DB.commitTransaction(); + + // If there's a processor cache file, delete it (whether or not we just used it) + var cacheFile = this.getItemProcessorCacheFile(itemID); + if (cacheFile.exists()) { + cacheFile.remove(false); + } + + Zotero.Notifier.trigger('refresh', 'item', itemID); } finally { Zotero.UnresponsiveScriptIndicator.enable(); @@ -312,25 +357,25 @@ Zotero.Fulltext = new function(){ return false; } - var text = document.body.innerHTML; - var maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); - if (text.length > maxLength) { + var obj = convertItemHTMLToText(itemID, document.body.innerHTML, maxLength); + var text = obj.text; + var totalChars = obj.totalChars; + + if (totalChars > maxLength) { Zotero.debug('Only indexing first ' + maxLength + ' characters of item ' + itemID + ' in indexDocument()'); - text = text.substr(0, maxLength); } - text = text.replace(/(>)/g, '$1 '); - text = this.HTMLToText(text); this.indexString(text, document.characterSet, itemID); - - var charsIndexed = Math.min(maxLength, text.length); - this.setChars(itemID, { indexed: charsIndexed, total: text.length }); + this.setChars(itemID, { indexed: text.length, total: totalChars }); } - function indexFile(file, mimeType, charset, itemID, maxLength, isCacheFile) { + /** + * @param {Boolean} [complete=FALSE] Index the file in its entirety, ignoring maxLength + */ + function indexFile(file, mimeType, charset, itemID, complete, isCacheFile) { if (!file.exists()){ Zotero.debug('File not found in indexFile()', 2); return false; @@ -343,18 +388,10 @@ Zotero.Fulltext = new function(){ return false; } - if (maxLength == undefined || maxLength === true) { - maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); - } - // If maxLength is explicitly false, index everything - else if (maxLength === false || maxLength === null) { - maxLength = false; - } - if (mimeType == 'application/pdf') { try { Zotero.UnresponsiveScriptIndicator.disable(); - return this.indexPDF(file, itemID, !maxLength); + return this.indexPDF(file, itemID, complete); } finally { Zotero.UnresponsiveScriptIndicator.enable(); @@ -373,29 +410,27 @@ Zotero.Fulltext = new function(){ Zotero.debug('Indexing file ' + file.path); - var text = Zotero.File.getContents(file, charset, maxLength); - // Split elements to avoid word concatentation - text = text.replace(/(>)/g, '$1 '); - text = this.HTMLToText(text); + var text = Zotero.File.getContents(file, charset); + var totalChars = text.length; + var maxLength = complete ? false : Zotero.Prefs.get('fulltext.textMaxLength'); + + if (mimeType == 'text/html') { + let obj = convertItemHTMLToText(itemID, text, maxLength); + text = obj.text; + totalChars = obj.totalChars; + } + else { + if (maxLength && text.length > maxLength) { + text = text.substr(0, maxLength); + } + } + this.indexString(text, charset, itemID); - // Record number of characters indexed + // Record the number of characters indexed (unless we're indexing a (PDF) cache file, + // in which case the stats are coming from elsewhere) if (!isCacheFile) { - try { - var totalChars = this.getTotalCharsFromFile(itemID); - } - catch (e) { - Zotero.debug(e); - Components.utils.reportError(e); - totalChars = 0; - } - if (maxLength) { - var charsIndexed = Math.min(maxLength, totalChars); - } - else { - var charsIndexed = totalChars; - } - this.setChars(itemID, { indexed: charsIndexed, total: totalChars }); + this.setChars(itemID, { indexed: text.length, total: totalChars }); } return true; @@ -491,7 +526,7 @@ Zotero.Fulltext = new function(){ } Zotero.DB.beginTransaction(); - this.indexFile(cacheFile, 'text/plain', 'utf-8', itemID, false, true); + this.indexFile(cacheFile, 'text/plain', 'utf-8', itemID, true, true); this.setPages(itemID, { indexed: pagesIndexed, total: totalPages }); Zotero.DB.commitTransaction(); return true; @@ -499,7 +534,7 @@ Zotero.Fulltext = new function(){ function indexItems(items, complete, ignoreErrors) { - if (items.constructor.name != 'Array') { + if (!Array.isArray(items)) { items = [items]; } var items = Zotero.Items.get(items); @@ -507,20 +542,22 @@ Zotero.Fulltext = new function(){ Zotero.DB.beginTransaction(); - for each(var i in items){ - if (!i.isAttachment()){ + for each (let item in items) { + if (!item.isAttachment()) { continue; } - var file = i.getFile(); + let itemID = item.id; + + var file = item.getFile(); if (!file){ - Zotero.debug("No file to index for item " + i.id + " in Fulltext.indexItems()"); + Zotero.debug("No file to index for item " + itemID + " in Fulltext.indexItems()"); continue; } if (ignoreErrors) { try { - this.indexFile(file, i.attachmentMIMEType, i.attachmentCharset, i.id, !complete); + this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, complete); } catch (e) { Zotero.debug(e, 1); @@ -529,7 +566,7 @@ Zotero.Fulltext = new function(){ } } else { - this.indexFile(file, i.attachmentMIMEType, i.attachmentCharset, i.id, !complete); + this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, complete); } } @@ -537,8 +574,354 @@ Zotero.Fulltext = new function(){ } + // + // Full-text content syncing + // + /** + * Get content and stats that haven't yet been synced + * + * @param {Integer} maxChars Maximum total characters to include. + * The total can go over this if there's a + * single large item. + * @return {Array<Object>} + */ + this.getUnsyncedContent = function (maxChars) { + var maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); + var first = true; + var chars = 0; + var contentItems = []; + var sql = "SELECT itemID, indexedChars, totalChars, indexedPages, totalPages " + + "FROM fulltextItems JOIN items USING (itemID) WHERE synced=" + SYNC_STATE_UNSYNCED + + " ORDER BY clientDateModified DESC"; + var rows = Zotero.DB.query(sql) || []; + for each (let row in rows) { + let text; + let itemID = row.itemID; + let item = Zotero.Items.get(itemID); + let libraryKey = item.libraryID + "/" + item.key; + let mimeType = item.attachmentMIMEType; + if (isCachedMIMEType(mimeType) || Zotero.MIME.isTextType(mimeType)) { + try { + let cacheFile = this.getItemCacheFile(itemID); + if (cacheFile.exists()) { + Zotero.debug("Adding full-text content from cache " + + "file for item " + libraryKey); + text = Zotero.File.getContents(cacheFile); + } + else { + if (!Zotero.MIME.isTextType(mimeType)) { + Zotero.debug("Full-text content cache file doesn't exist for item " + + libraryKey, 2); + continue; + } + + let file = item.getFile(); + if (!file) { + Zotero.debug("File doesn't exist getting full-text content for item " + + libraryKey, 2); + continue; + } + + Zotero.debug("Adding full-text content from file for item " + libraryKey); + text = Zotero.File.getContents(file, item.attachmentCharset); + + // If HTML, convert to plain text first, and cache the result + if (item.attachmentMIMEType == 'text/html') { + let obj = convertItemHTMLToText( + itemID, + text, + // Include in the cache file only as many characters as we + // indexed previously + row.indexedChars + ); + text = obj.text; + } + else { + // Include only as many characters as we've indexed + text = text.substr(0, row.indexedChars); + } + } + } + catch (e) { + Zotero.debug(e, 1); + Components.utils.reportError(e); + continue; + } + } + else { + Zotero.debug("Skipping non-text file getting full-text content for item " + + libraryKey, 2); + + // Delete rows for items that weren't supposed to be indexed + this.clearItemWords(itemID); + continue; + } + + // If this isn't the first item and it would put us over the limit, + // skip it + if (!first && maxChars && ((chars + text.length) > maxChars)) { + continue; + } + chars += text.length; + first = false; + + contentItems.push({ + libraryID: item.libraryID, + key: item.key, + text: text, + indexedChars: row.indexedChars ? row.indexedChars : 0, + totalChars: row.totalChars ? row.totalChars : 0, + indexedPages: row.indexedPages ? row.indexedPages : 0, + totalPages: row.totalPages ? row.totalPages : 0 + }); + + if (maxChars && chars > maxChars) { + break; + } + } + return contentItems; + } + + + /** + * @return {String} PHP-formatted POST data for items not yet downloaded + */ + this.getUndownloadedPostData = function () { + // On upgrade, get all content + var sql = "SELECT value FROM settings WHERE setting='fulltext' AND key='downloadAll'"; + if (Zotero.DB.valueQuery(sql)) { + return "&ftkeys=all"; + } + + var sql = "SELECT itemID FROM fulltextItems WHERE synced=" + + SYNC_STATE_TO_DOWNLOAD; + var itemIDs = Zotero.DB.columnQuery(sql); + if (!itemIDs) { + return ""; + } + var undownloaded = {}; + for each (let itemID in itemIDs) { + let item = Zotero.Items.get(itemID); + let libraryID = item.libraryID + libraryID = libraryID ? libraryID : Zotero.libraryID; + if (!undownloaded[libraryID]) { + undownloaded[libraryID] = []; + } + undownloaded[libraryID].push(item.key); + } + var data = ""; + for (let libraryID in undownloaded) { + for (let i = 0; i < undownloaded[libraryID].length; i++) { + data += "&" + encodeURIComponent("ftkeys[" + libraryID + "][" + i + "]") + + "=" + undownloaded[libraryID][i]; + } + } + return data; + } + + + /** + * Save full-text content and stats to a cache file + */ + this.setItemContent = function (libraryID, key, text, stats, version) { + var item = Zotero.Items.getByLibraryAndKey(libraryID, key); + if (!item) { + let msg = "Item not found setting full-text content"; + Zotero.debug(msg, 1); + Components.utils.reportError(msg); + return; + } + var itemID = item.id; + + if (text !== '') { + var cacheFile = this.getItemProcessorCacheFile(itemID); + + // If a storage directory doesn't exist, create it + if (!cacheFile.parent.exists()) { + Zotero.Attachments.createDirectoryForItem(itemID); + } + + Zotero.debug("Writing full-text content and data to " + cacheFile.path); + Zotero.File.putContents(cacheFile, JSON.stringify({ + indexedChars: stats.indexedChars, + totalChars: stats.totalChars, + indexedPages: stats.indexedPages, + totalPages: stats.totalPages, + version: version, + text: text + })); + var synced = SYNC_STATE_TO_PROCESS; + } + else { + Zotero.debug("Marking full-text content for download"); + var synced = SYNC_STATE_TO_DOWNLOAD; + } + + // Mark the item as unprocessed + if (Zotero.DB.valueQuery("SELECT COUNT(*) FROM fulltextItems WHERE itemID=?", itemID)) { + Zotero.DB.query("UPDATE fulltextItems SET synced=? WHERE itemID=?", [synced, itemID]); + } + // If not yet indexed, add an empty row + else { + Zotero.DB.query( + "REPLACE INTO fulltextItems (itemID, version, synced) VALUES (?, 0, ?)", + [itemID, synced] + ); + } + + if (_upgradeCheck) { + Zotero.DB.query("DELETE FROM settings WHERE setting='fulltext' AND key='downloadAll'"); + _upgradeCheck = false; + } + + this.startContentProcessor(); + } + + + /** + * Start the idle observer for the background content processor + */ + this.startContentProcessor = function () { + if (!_idleObserverIsRegistered) { + Zotero.debug("Initializing full-text content ingester idle observer"); + var idleService = Components.classes["@mozilla.org/widget/idleservice;1"] + .getService(Components.interfaces.nsIIdleService); + idleService.addIdleObserver(this.idleObserver, _idleObserverDelay); + _idleObserverIsRegistered = true; + } + } + + /** + * Stop the idle observer and a running timer, if there is one + */ + this.stopContentProcessor = function () { + if (_idleObserverIsRegistered) { + var idleService = Components.classes["@mozilla.org/widget/idleservice;1"] + .getService(Components.interfaces.nsIIdleService); + idleService.removeIdleObserver(this.idleObserver, _idleObserverDelay); + _idleObserverIsRegistered = false; + } + + if (_processorTimer) { + _processorTimer.cancel(); + _processorTimer = null; + } + } + + /** + * + * @param {Array<Integer>} itemIDs An array of itemIDs to process; if this + * is omitted, a database query is made + * to find unprocessed content + * @return {Boolean} TRUE if there's more content to process; FALSE otherwise + */ + this.processUnprocessedContent = function (itemIDs) { + if (!itemIDs) { + Zotero.debug("Checking for unprocessed full-text content"); + let sql = "SELECT itemID FROM fulltextItems WHERE synced=" + + SYNC_STATE_TO_PROCESS; + itemIDs = Zotero.DB.columnQuery(sql) || []; + } + // If there's no more unprocessed content, stop the idle observer + if (!itemIDs.length) { + Zotero.debug("No unprocessed full-text content found"); + this.stopContentProcessor(); + return; + } + + let itemID = itemIDs.shift(); + let item = Zotero.Items.get(itemID); + + Zotero.debug("Processing full-text content for item " + item.libraryKey); + + Zotero.Fulltext.indexFromProcessorCache(itemID) + .then(function () { + if (itemIDs.length) { + if (!_processorTimer) { + _processorTimer = Components.classes["@mozilla.org/timer;1"] + .createInstance(Components.interfaces.nsITimer); + } + _processorTimer.initWithCallback( + function () { + Zotero.Fulltext.processUnprocessedContent(itemIDs); + }, + 100, + Components.interfaces.nsITimer.TYPE_ONE_SHOT + ); + } + }) + .done(); + } + + this.idleObserver = { + observe: function (subject, topic, data) { + // On idle, start the background processor + if (topic == 'idle') { + Zotero.Fulltext.processUnprocessedContent(); + } + // When back from idle, stop the processor (but keep the idle + // observer registered) + else if (topic == 'active') { + if (_processorTimer) { + Zotero.debug("Stopping full-text content processor"); + _processorTimer.cancel(); + } + } + } + }; + + + this.indexFromProcessorCache = function (itemID) { + var self = this; + return Q.fcall(function () { + var cacheFile = self.getItemProcessorCacheFile(itemID); + if (!cacheFile.exists()) { + Zotero.debug("Full-text content processor cache file doesn't exist for item " + itemID); + return false; + } + + let data; + + return Zotero.File.getContentsAsync(cacheFile) + .then(function (json) { + data = JSON.parse(json); + + // Write the text content to the regular cache file + cacheFile = self.getItemCacheFile(itemID); + + Zotero.debug("Writing full-text content to " + cacheFile.path); + return Zotero.File.putContentsAsync(cacheFile, data.text, "UTF-8"); + }) + .then(function () { + Zotero.Fulltext.indexString( + data.text, + "UTF-8", + itemID, + { + indexedChars: data.indexedChars, + totalChars: data.totalChars, + indexedPages: data.indexedPages, + totalPages: data.totalPages + }, + data.version, + 1 + ); + }); + }) + .catch(function (e) { + Components.utils.reportError(e); + Zotero.debug(e, 1); + return false; + }); + } + + // + // End full-text content syncing + // + + /* - * Scan a file for a text string + * Scan a string for another string * * _items_ -- one or more attachment items to search * _searchText_ -- text pattern to search for @@ -548,21 +931,7 @@ Zotero.Fulltext = new function(){ * * - Slashes in regex are optional */ - function findTextInFile(file, charset, searchText, mode){ - Zotero.debug("Searching for text '" + searchText + "' in " + file.path); - - var maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); - var str = Zotero.File.getContents(file, charset, maxLength); - - // If not binary mode, convert HTML to text - if (!mode || mode.indexOf('Binary')==-1){ - // Split elements to avoid word concatentation - str = str.replace(/(>)/g, '$1 '); - - // Parse to avoid searching on HTML - str = this.HTMLToText(str); - } - + this.findTextInString = function (content, searchText, mode) { switch (mode){ case 'regexp': case 'regexpCS': @@ -583,7 +952,7 @@ Zotero.Fulltext = new function(){ try { var re = new RegExp(searchText, flags); - var matches = re.exec(str); + var matches = re.exec(content); } catch (e) { Zotero.debug(e, 1); @@ -591,7 +960,7 @@ Zotero.Fulltext = new function(){ } if (matches){ Zotero.debug("Text found"); - return str.substr(matches.index, 50); + return content.substr(matches.index, 50); } break; @@ -599,12 +968,12 @@ Zotero.Fulltext = new function(){ default: // Case-insensitive searchText = searchText.toLowerCase(); - str = str.toLowerCase(); + content = content.toLowerCase(); - var pos = str.indexOf(searchText); + var pos = content.indexOf(searchText); if (pos!=-1){ Zotero.debug('Text found'); - return str.substr(pos, 50); + return content.substr(pos, 50); } } @@ -633,42 +1002,70 @@ Zotero.Fulltext = new function(){ var items = Zotero.Items.get(items); var found = []; - for each(var i in items){ - if (!i.isAttachment()){ + for each (let item in items) { + if (!item.isAttachment()) { continue; } - var file = i.getFile(); - if (!file){ - continue; - } + let itemID = item.id; + let content; + let mimeType = item.attachmentMIMEType; + let maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); + let binaryMode = mode && mode.indexOf('Binary') != -1; - var mimeType = i.attachmentMIMEType; if (isCachedMIMEType(mimeType)) { - var file = this.getItemCacheFile(i.id); + let file = this.getItemCacheFile(itemID); if (!file.exists()) { continue; } - mimeType = 'text/plain'; - var charset = 'utf-8'; + Zotero.debug("Searching for text '" + searchText + "' in " + file.path); + content = Zotero.File.getContents(file, 'utf-8', maxLength); } else { // If not binary mode, only scan plaintext files - if (!mode || mode.indexOf('Binary') == -1) { + if (!binaryMode) { if (!Zotero.MIME.isTextType(mimeType)) { Zotero.debug('Not scanning MIME type ' + mimeType, 4); continue; } } - var charset = i.attachmentCharset; + // Check for a cache file + let cacheFile = this.getItemCacheFile(itemID); + if (cacheFile.exists()) { + Zotero.debug("Searching for text '" + searchText + "' in " + cacheFile.path); + content = Zotero.File.getContents(cacheFile, 'utf-8', maxLength); + } + else { + // If that doesn't exist, check for the actual file + let file = item.getFile(); + if (!file) { + continue; + } + + Zotero.debug("Searching for text '" + searchText + "' in " + file.path); + content = Zotero.File.getContents(file, item.attachmentCharset); + + // If HTML and not binary mode, convert to text + if (mimeType == 'text/html' && !binaryMode) { + // Include in the cache file only as many characters as we've indexed + let chars = this.getChars(itemID); + + let obj = convertItemHTMLToText( + itemID, content, chars ? chars.indexedChars : null + ); + content = obj.text; + } + } } - var match = this.findTextInFile(file, charset, searchText, mode); - - if (match != -1){ - found.push({id:i.getID(), match:match}); + let match = this.findTextInString(content, searchText, mode); + if (match != -1) { + found.push({ + id: itemID, + match: match + }); } } @@ -681,8 +1078,8 @@ Zotero.Fulltext = new function(){ var sql = "SELECT rowid FROM fulltextItems WHERE itemID=? LIMIT 1"; var indexed = Zotero.DB.valueQuery(sql, itemID); if (indexed) { - Zotero.DB.query("DELETE FROM fulltextItems WHERE itemID=?", itemID); Zotero.DB.query("DELETE FROM fulltextItemWords WHERE itemID=?", itemID); + Zotero.DB.query("DELETE FROM fulltextItems WHERE itemID=?", itemID); } Zotero.DB.commitTransaction(); @@ -760,15 +1157,27 @@ Zotero.Fulltext = new function(){ function setPages(itemID, obj) { var sql = "UPDATE fulltextItems SET indexedPages=?, totalPages=? WHERE itemID=?"; - Zotero.DB.query(sql, [obj.indexed ? obj.indexed : null, - obj.total ? obj.total : null, itemID]); + Zotero.DB.query( + sql, + [ + obj.indexed ? parseInt(obj.indexed) : null, + obj.total ? parseInt(obj.total) : null, + itemID + ] + ); } function setChars(itemID, obj) { var sql = "UPDATE fulltextItems SET indexedChars=?, totalChars=? WHERE itemID=?"; - Zotero.DB.query(sql, [obj.indexed ? obj.indexed : null, - obj.total ? obj.total : null, itemID]); + Zotero.DB.query( + sql, + [ + obj.indexed ? parseInt(obj.indexed) : null, + obj.total ? parseInt(obj.total) : null, + itemID + ] + ); } @@ -878,6 +1287,13 @@ Zotero.Fulltext = new function(){ } + this.getItemProcessorCacheFile = function (itemID) { + var cacheFile = Zotero.Attachments.getStorageDirectory(itemID); + cacheFile.append(_processorCacheFile); + return cacheFile; + } + + /* * Returns true if an item can be reindexed * @@ -971,18 +1387,14 @@ Zotero.Fulltext = new function(){ } Zotero.debug('Clearing full-text cache file for item ' + itemID); - switch (item.attachmentMIMEType) { - case 'application/pdf': - var cacheFile = this.getItemCacheFile(itemID); - if (cacheFile.exists()) { - try { - cacheFile.remove(false); - } - catch (e) { - Zotero.File.checkFileAccessError(e, cacheFile, 'delete'); - } - } - break; + var cacheFile = this.getItemCacheFile(itemID); + if (cacheFile.exists()) { + try { + cacheFile.remove(false); + } + catch (e) { + Zotero.File.checkFileAccessError(e, cacheFile, 'delete'); + } } } @@ -1022,23 +1434,53 @@ Zotero.Fulltext = new function(){ } - function HTMLToText(text){ - var nsIFC = - Components.classes['@mozilla.org/widget/htmlformatconverter;1']. - createInstance(Components.interfaces.nsIFormatConverter); - var from = Components.classes['@mozilla.org/supports-string;1']. - createInstance(Components.interfaces.nsISupportsString); - from.data = text; - var to = {value:null}; + /** + * Convert HTML to text for an item and cache the result + */ + function convertItemHTMLToText(itemID, html, maxLength) { + // Split elements to avoid word concatentation + html = html.replace(/>/g, '> '); + + var text = HTMLToText(html); + var totalChars = text.length; + + if (maxLength) { + text = text.substr(0, maxLength); + } + + // Write the converted text to a cache file + var cacheFile = Zotero.Fulltext.getItemCacheFile(itemID); + Zotero.debug("Writing converted full-text HTML content to " + cacheFile.path); + if (!cacheFile.parent.exists()) { + Zotero.Attachments.createDirectoryForItem(itemID); + } + Zotero.File.putContentsAsync(cacheFile, text) + .catch(function (e) { + Zotero.debug(e, 1); + Components.utils.reportError(e); + }); + + return { + text: text, + totalChars: totalChars + }; + } + + function HTMLToText(html) { + var nsIFC = Components.classes['@mozilla.org/widget/htmlformatconverter;1'] + .createInstance(Components.interfaces.nsIFormatConverter); + var from = Components.classes['@mozilla.org/supports-string;1'] + .createInstance(Components.interfaces.nsISupportsString); + from.data = html; + var to = { value: null }; try { - nsIFC.convert('text/html', from, from.toString().length, - 'text/unicode', to, {}); + nsIFC.convert('text/html', from, from.toString().length, 'text/unicode', to, {}); to = to.value.QueryInterface(Components.interfaces.nsISupportsString); return to.toString(); } - catch(e){ + catch(e) { Zotero.debug(e, 1); - return text; + return html; } } diff --git a/chrome/content/zotero/xpcom/schema.js b/chrome/content/zotero/xpcom/schema.js @@ -1446,7 +1446,10 @@ Zotero.Schema = new function(){ Zotero.Schema.updateCustomTables(true); _updateDBVersion('system', _getSchemaSQLVersion('system')); - _updateDBVersion('userdata', _getSchemaSQLVersion('userdata')); + // TEMP: 77 is for full-text syncing. New users don't need the + // prompt, so initialize new databases to 77. + //_updateDBVersion('userdata', _getSchemaSQLVersion('userdata')); + _updateDBVersion('userdata', 77); _updateDBVersion('userdata2', _getSchemaSQLVersion('userdata2')); _updateDBVersion('triggers', _getSchemaSQLVersion('triggers')); @@ -1792,7 +1795,11 @@ Zotero.Schema = new function(){ return false; } - if (fromVersion > toVersion){ + // 77 is a hack for full-text content syncing + if (fromVersion == 77) { + return false; + } + else if (fromVersion > toVersion) { throw("Zotero user data DB version is newer than SQL file"); } diff --git a/chrome/content/zotero/xpcom/sync.js b/chrome/content/zotero/xpcom/sync.js @@ -55,6 +55,10 @@ Zotero.Sync = new function() { setting: { singular: 'Setting', plural: 'Settings' + }, + fulltext: { + singular: 'Fulltext', + plural: 'Fulltexts' } }; }); @@ -132,7 +136,7 @@ Zotero.Sync = new function() { } for (var type in this.syncObjects) { - if (type == 'setting') { + if (type == 'setting' || type == 'fulltext') { continue; } @@ -451,6 +455,9 @@ Zotero.Sync.EventListener = new function () { if (type == 'setting') { [libraryID, key] = ids[i].split("/"); } + else if (type == 'fulltext') { + continue; + } else { var oldItem = extraData[ids[i]].old; libraryID = oldItem.primary.libraryID; @@ -1361,6 +1368,42 @@ Zotero.Sync.Server = new function () { _error(e); } + // TEMP + if (Zotero.Prefs.get("sync.fulltext.enabled") && + Zotero.DB.valueQuery("SELECT version FROM version WHERE schema='userdata'") < 77) { + // Don't show multiple times on idle + _syncInProgress = true; + + let ps = Components.classes["@mozilla.org/embedcomp/prompt-service;1"] + .getService(Components.interfaces.nsIPromptService); + let buttonFlags = (ps.BUTTON_POS_0) * (ps.BUTTON_TITLE_IS_STRING) + + (ps.BUTTON_POS_1) * (ps.BUTTON_TITLE_IS_STRING) + + ps.BUTTON_DELAY_ENABLE; + let index = ps.confirmEx( + null, + Zotero.getString('sync.fulltext.upgradePrompt.title'), + Zotero.getString('sync.fulltext.upgradePrompt.text') + "\n\n" + + Zotero.getString('sync.fulltext.upgradePrompt.changeLater'), + buttonFlags, + Zotero.getString('sync.fulltext.upgradePrompt.enable'), + Zotero.getString('general.notNow'), + null, null, {} + ); + + _syncInProgress = false; + + // Enable + if (index == 0) { + Zotero.DB.backupDatabase(76, true); + Zotero.DB.query("UPDATE version SET version=77 WHERE schema='userdata'"); + Zotero.wait(1000); + } + // Disable + else { + Zotero.Prefs.set("sync.fulltext.enabled", false); + } + } + username = encodeURIComponent(username); password = encodeURIComponent(password); var body = _apiVersionComponent @@ -1462,6 +1505,13 @@ Zotero.Sync.Server = new function () { body += '&upload=1'; } + if (Zotero.Prefs.get("sync.fulltext.enabled")) { + body += "&ft=1" + Zotero.Fulltext.getUndownloadedPostData(); + } + else { + body += "&ft=0"; + } + Zotero.Sync.Runner.setSyncStatus(Zotero.getString('sync.status.gettingUpdatedData')); Zotero.HTTP.doPost(url, body, function (xmlhttp) { @@ -1694,6 +1744,14 @@ Zotero.Sync.Server = new function () { var sql = "UPDATE syncedSettings SET synced=1"; Zotero.DB.query(sql); + if (syncSession.fulltextItems && syncSession.fulltextItems.length) { + let sql = "UPDATE fulltextItems SET synced=1 WHERE itemID=?"; + for each (let lk in syncSession.fulltextItems) { + let item = Zotero.Items.getByLibraryAndKey(lk.libraryID, lk.key); + Zotero.DB.query(sql, item.id); + } + } + //throw('break2'); Zotero.DB.commitTransaction(); @@ -2825,6 +2883,26 @@ Zotero.Sync.Server.Data = new function() { Zotero.SyncedSettings.setSynchronous(libraryID, name, value, version, true); continue; } + else if (type == 'fulltext') { + if (!libraryID) { + libraryID = 0; + } + let key = objectNode.getAttribute('key'); + Zotero.debug("Processing remote full-text content for item " + libraryID + "/" + key); + Zotero.Fulltext.setItemContent( + libraryID, + key, + objectNode.textContent, + { + indexedChars: parseInt(objectNode.getAttribute('indexedChars')), + totalChars: parseInt(objectNode.getAttribute('totalChars')), + indexedPages: parseInt(objectNode.getAttribute('indexedPages')), + totalPages: parseInt(objectNode.getAttribute('totalPages')) + }, + parseInt(objectNode.getAttribute('version')) + ); + continue; + } var key = objectNode.getAttribute('key'); var objLibraryKeyHash = Zotero[Types].makeLibraryKeyHash(libraryID, key); @@ -3537,6 +3615,38 @@ Zotero.Sync.Server.Data = new function() { docElem.appendChild(settingsNode); } + if (Zotero.Prefs.get("sync.fulltext.enabled")) { + // Add up to 500K characters of full-text content + try { + var rows = Zotero.Fulltext.getUnsyncedContent(500000); + } + catch (e) { + Zotero.debug(e, 1); + Components.utils.reportError(e); + var rows = []; + } + if (rows.length) { + let fulltextsNode = doc.createElement('fulltexts'); + syncSession.fulltextItems = []; + for (let i=0; i<rows.length; i++) { + syncSession.fulltextItems.push({ + libraryID: rows[i].libraryID, + key: rows[i].key + }) + let node = doc.createElement('fulltext'); + node.setAttribute('libraryID', rows[i].libraryID ? rows[i].libraryID : Zotero.libraryID); + node.setAttribute('key', rows[i].key); + node.setAttribute('indexedChars', rows[i].indexedChars); + node.setAttribute('totalChars', rows[i].totalChars); + node.setAttribute('indexedPages', rows[i].indexedPages); + node.setAttribute('totalPages', rows[i].totalPages); + node.appendChild(doc.createTextNode(_xmlize(rows[i].text))); + fulltextsNode.appendChild(node); + } + docElem.appendChild(fulltextsNode); + } + } + // Deletions var deletedNode = doc.createElement('deleted'); var inserted = false; diff --git a/chrome/content/zotero/xpcom/zotero.js b/chrome/content/zotero/xpcom/zotero.js @@ -558,8 +558,6 @@ Components.utils.import("resource://gre/modules/Services.jsm"); Zotero.DB.addCallback('commit', Zotero.Notifier.commit); Zotero.DB.addCallback('rollback', Zotero.Notifier.reset); - Zotero.Fulltext.init(); - // Require >=2.1b3 database to ensure proper locking if (Zotero.isStandalone && Zotero.Schema.getDBVersion('system') > 0 && Zotero.Schema.getDBVersion('system') < 31) { var appStartup = Components.classes["@mozilla.org/toolkit/app-startup;1"] @@ -651,6 +649,8 @@ Components.utils.import("resource://gre/modules/Services.jsm"); } } + Zotero.Fulltext.init(); + Zotero.DB.startDummyStatement(); // Populate combined tables for custom types and fields -- this is likely temporary @@ -2288,6 +2288,17 @@ Zotero.Prefs = new function(){ } break; + // TEMP + case "sync.fulltext.enabled": + if (this.get("sync.fulltext.enabled")) { + // Disable downgrades if full-text sync is enabled, since otherwise + // we could miss full-text content updates + if (Zotero.DB.valueQuery("SELECT version FROM version WHERE schema='userdata'") < 77) { + Zotero.DB.query("UPDATE version SET version=77 WHERE schema='userdata'"); + } + } + break; + case "search.quicksearch-mode": var wm = Components.classes["@mozilla.org/appshell/window-mediator;1"] .getService(Components.interfaces.nsIWindowMediator); diff --git a/chrome/locale/en-US/zotero/preferences.dtd b/chrome/locale/en-US/zotero/preferences.dtd @@ -55,6 +55,8 @@ <!ENTITY zotero.preferences.sync.createAccount "Create Account"> <!ENTITY zotero.preferences.sync.lostPassword "Lost Password?"> <!ENTITY zotero.preferences.sync.syncAutomatically "Sync automatically"> +<!ENTITY zotero.preferences.sync.syncFullTextContent "Sync full-text content"> +<!ENTITY zotero.preferences.sync.syncFullTextContent.desc "Zotero can sync the full-text content of files in your Zotero libraries with zotero.org and other linked devices, allowing you to easily search for your files wherever you are. The full-text content of your files will not be shared publicly."> <!ENTITY zotero.preferences.sync.about "About Syncing"> <!ENTITY zotero.preferences.sync.fileSyncing "File Syncing"> <!ENTITY zotero.preferences.sync.fileSyncing.url "URL:"> diff --git a/chrome/locale/en-US/zotero/zotero.properties b/chrome/locale/en-US/zotero/zotero.properties @@ -27,6 +27,7 @@ general.updateAvailable = Update Available general.upgrade = Upgrade general.yes = Yes general.no = No +general.notNow = Not Now general.passed = Passed general.failed = Failed general.and = and @@ -811,6 +812,11 @@ sync.status.uploadingData = Uploading data to sync server sync.status.uploadAccepted = Upload accepted \u2014 waiting for sync server sync.status.syncingFiles = Syncing files +sync.fulltext.upgradePrompt.title = New: Full-Text Content Syncing +sync.fulltext.upgradePrompt.text = Zotero can now sync the full-text content of files in your Zotero libraries with zotero.org and other linked devices, allowing you to easily search for your files wherever you are. The full-text content of your files will not be shared publicly. +sync.fulltext.upgradePrompt.changeLater = You can change this setting later from the Sync pane of the Zotero preferences. +sync.fulltext.upgradePrompt.enable = Use Full-Text Syncing + sync.storage.mbRemaining = %SMB remaining sync.storage.kbRemaining = %SKB remaining sync.storage.filesRemaining = %1$S/%2$S files diff --git a/defaults/preferences/zotero.js b/defaults/preferences/zotero.js @@ -144,6 +144,7 @@ pref("extensions.zotero.sync.storage.deleteDelayDays", 30); pref("extensions.zotero.sync.storage.groups.enabled", true); pref("extensions.zotero.sync.storage.downloadMode.personal", "on-sync"); pref("extensions.zotero.sync.storage.downloadMode.groups", "on-sync"); +pref("extensions.zotero.sync.fulltext.enabled", true); // Proxy pref("extensions.zotero.proxies.autoRecognize", true); diff --git a/resource/schema/userdata.sql b/resource/schema/userdata.sql @@ -271,6 +271,7 @@ CREATE TABLE fulltextItems ( totalPages INT, indexedChars INT, totalChars INT, + synced INT DEFAULT 0, FOREIGN KEY (itemID) REFERENCES items(itemID) ); CREATE INDEX fulltextItems_version ON fulltextItems(version);