www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit a89388e77a04c914393b171b0819a6ce8c56a918
parent 0abd903917a1a4c3aeb0fade218d574d8d040239
Author: Dan Stillman <dstillman@zotero.org>
Date:   Mon,  4 Nov 2013 04:34:51 -0500

Better handling of maxLength and HTML

- Don't truncate before HTML conversion
- Correctly calculate indexed chars and total chars
- Move HTML conversion code into one function

Diffstat:
Mchrome/content/zotero/xpcom/fulltext.js | 195++++++++++++++++++++++++++++++++++++-------------------------------------------
1 file changed, 88 insertions(+), 107 deletions(-)

diff --git a/chrome/content/zotero/xpcom/fulltext.js b/chrome/content/zotero/xpcom/fulltext.js @@ -54,7 +54,6 @@ Zotero.Fulltext = new function(){ this.clearCacheFiles = clearCacheFiles; //this.clearItemContent = clearItemContent; this.purgeUnusedWords = purgeUnusedWords; - this.HTMLToText = HTMLToText; this.semanticSplitter = semanticSplitter; this.__defineGetter__("pdfToolsDownloadBaseURL", function() { return 'http://www.zotero.org/download/xpdf/'; }); @@ -358,38 +357,25 @@ Zotero.Fulltext = new function(){ return false; } - var text = document.body.innerHTML; - var maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); - if (text.length > maxLength) { + var obj = convertItemHTMLToText(itemID, document.body.innerHTML, maxLength); + var text = obj.text; + var totalChars = obj.totalChars; + + if (totalChars > maxLength) { Zotero.debug('Only indexing first ' + maxLength + ' characters of item ' + itemID + ' in indexDocument()'); - text = text.substr(0, maxLength); } - text = text.replace(/(>)/g, '$1 '); - text = this.HTMLToText(text); this.indexString(text, document.characterSet, itemID); - var charsIndexed = Math.min(maxLength, text.length); - this.setChars(itemID, { indexed: charsIndexed, total: text.length }); - - // Write the converted text to a cache file - Q.fcall(function () { - let cacheFile = self.getItemCacheFile(itemID); - Zotero.debug("Writing converted full-text HTML content to " + cacheFile.path); - if (!cacheFile.parent.exists()) { - Zotero.Attachments.createDirectoryForItem(itemID); - } - return Zotero.File.putContentsAsync(cacheFile, text); - }) - .catch(function (e) { - Zotero.debug(e, 1); - Components.utils.reportError(e); - }) + this.setChars(itemID, { indexed: text.length, total: totalChars }); } - function indexFile(file, mimeType, charset, itemID, maxLength, isCacheFile) { + /** + * @param {Boolean} [complete=FALSE] Index the file in its entirety, ignoring maxLength + */ + function indexFile(file, mimeType, charset, itemID, complete, isCacheFile) { if (!file.exists()){ Zotero.debug('File not found in indexFile()', 2); return false; @@ -402,18 +388,10 @@ Zotero.Fulltext = new function(){ return false; } - if (maxLength == undefined || maxLength === true) { - maxLength = Zotero.Prefs.get('fulltext.textMaxLength'); - } - // If maxLength is explicitly false, index everything - else if (maxLength === false || maxLength === null) { - maxLength = false; - } - if (mimeType == 'application/pdf') { try { Zotero.UnresponsiveScriptIndicator.disable(); - return this.indexPDF(file, itemID, !maxLength); + return this.indexPDF(file, itemID, complete); } finally { Zotero.UnresponsiveScriptIndicator.enable(); @@ -432,29 +410,27 @@ Zotero.Fulltext = new function(){ Zotero.debug('Indexing file ' + file.path); - var text = Zotero.File.getContents(file, charset, maxLength); - // Split elements to avoid word concatentation - text = text.replace(/(>)/g, '$1 '); - text = this.HTMLToText(text); + var text = Zotero.File.getContents(file, charset); + var totalChars = text.length; + var maxLength = complete ? false : Zotero.Prefs.get('fulltext.textMaxLength'); + + if (mimeType == 'text/html') { + let obj = convertItemHTMLToText(itemID, text, maxLength); + text = obj.text; + totalChars = obj.totalChars; + } + else { + if (maxLength && text.length > maxLength) { + text = text.substr(0, maxLength); + } + } + this.indexString(text, charset, itemID); - // Record number of characters indexed + // Record the number of characters indexed (unless we're indexing a (PDF) cache file, + // in which case the stats are coming from elsewhere) if (!isCacheFile) { - try { - var totalChars = this.getTotalCharsFromFile(itemID); - } - catch (e) { - Zotero.debug(e); - Components.utils.reportError(e); - totalChars = 0; - } - if (maxLength) { - var charsIndexed = Math.min(maxLength, totalChars); - } - else { - var charsIndexed = totalChars; - } - this.setChars(itemID, { indexed: charsIndexed, total: totalChars }); + this.setChars(itemID, { indexed: text.length, total: totalChars }); } return true; @@ -550,7 +526,7 @@ Zotero.Fulltext = new function(){ } Zotero.DB.beginTransaction(); - this.indexFile(cacheFile, 'text/plain', 'utf-8', itemID, false, true); + this.indexFile(cacheFile, 'text/plain', 'utf-8', itemID, true, true); this.setPages(itemID, { indexed: pagesIndexed, total: totalPages }); Zotero.DB.commitTransaction(); return true; @@ -581,7 +557,7 @@ Zotero.Fulltext = new function(){ if (ignoreErrors) { try { - this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, !complete); + this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, complete); } catch (e) { Zotero.debug(e, 1); @@ -590,7 +566,7 @@ Zotero.Fulltext = new function(){ } } else { - this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, !complete); + this.indexFile(file, item.attachmentMIMEType, item.attachmentCharset, itemID, complete); } } @@ -646,29 +622,18 @@ Zotero.Fulltext = new function(){ } Zotero.debug("Adding full-text content from file for item " + libraryKey); - text = Zotero.File.getContents(file, item.attachmentCharset, maxLength); + text = Zotero.File.getContents(file, item.attachmentCharset); // If HTML, convert to plain text first, and cache the result if (item.attachmentMIMEType == 'text/html') { - // Split elements to avoid word concatentation - text = text.replace(/(>)/g, '$1 '); - - text = this.HTMLToText(text); - - // Include in the cache file only as many characters as we've indexed - text = text.substr(0, row.indexedChars); - - // Write the converted text to a cache file - Zotero.debug("Writing converted full-text HTML content to " - + cacheFile.path); - if (!cacheFile.parent.exists()) { - Zotero.Attachments.createDirectoryForItem(itemID); - } - Zotero.File.putContentsAsync(cacheFile, text) - .catch(function (e) { - Zotero.debug(e, 1); - Components.utils.reportError(e); - }); + let obj = convertItemHTMLToText( + itemID, + text, + // Include in the cache file only as many characters as we + // indexed previously + row.indexedChars + ); + text = obj.text; } else { // Include only as many characters as we've indexed @@ -1079,31 +1044,17 @@ Zotero.Fulltext = new function(){ } Zotero.debug("Searching for text '" + searchText + "' in " + file.path); - content = Zotero.File.getContents(file, item.attachmentCharset, maxLength); + content = Zotero.File.getContents(file, item.attachmentCharset); // If HTML and not binary mode, convert to text if (mimeType == 'text/html' && !binaryMode) { - // Split elements to avoid word concatentation - content = content.replace(/(>)/g, '$1 '); - - content = this.HTMLToText(content); - // Include in the cache file only as many characters as we've indexed let chars = this.getChars(itemID); - if (chars && chars.indexedChars) { - content = content.substr(0, chars.indexedChars); - } - // Write the converted text to a cache file for future searches - Zotero.debug("Writing converted full-text content to " + cacheFile.path); - if (!cacheFile.parent.exists()) { - Zotero.Attachments.createDirectoryForItem(itemID); - } - Zotero.File.putContentsAsync(cacheFile, content) - .catch(function (e) { - Zotero.debug(e, 1); - Components.utils.reportError(e); - }) + let obj = convertItemHTMLToText( + itemID, content, chars ? chars.indexedChars : null + ); + content = obj.text; } } } @@ -1482,23 +1433,53 @@ Zotero.Fulltext = new function(){ } - function HTMLToText(text){ - var nsIFC = - Components.classes['@mozilla.org/widget/htmlformatconverter;1']. - createInstance(Components.interfaces.nsIFormatConverter); - var from = Components.classes['@mozilla.org/supports-string;1']. - createInstance(Components.interfaces.nsISupportsString); - from.data = text; - var to = {value:null}; + /** + * Convert HTML to text for an item and cache the result + */ + function convertItemHTMLToText(itemID, html, maxLength) { + // Split elements to avoid word concatentation + html = html.replace(/>/g, '> '); + + var text = HTMLToText(html); + var totalChars = text.length; + + if (maxLength) { + text = text.substr(0, maxLength); + } + + // Write the converted text to a cache file + var cacheFile = Zotero.Fulltext.getItemCacheFile(itemID); + Zotero.debug("Writing converted full-text HTML content to " + cacheFile.path); + if (!cacheFile.parent.exists()) { + Zotero.Attachments.createDirectoryForItem(itemID); + } + Zotero.File.putContentsAsync(cacheFile, text) + .catch(function (e) { + Zotero.debug(e, 1); + Components.utils.reportError(e); + }); + + return { + text: text, + totalChars: totalChars + }; + } + + function HTMLToText(html) { + var nsIFC = Components.classes['@mozilla.org/widget/htmlformatconverter;1'] + .createInstance(Components.interfaces.nsIFormatConverter); + var from = Components.classes['@mozilla.org/supports-string;1'] + .createInstance(Components.interfaces.nsISupportsString); + from.data = html; + var to = { value: null }; try { - nsIFC.convert('text/html', from, from.toString().length, - 'text/unicode', to, {}); + nsIFC.convert('text/html', from, from.toString().length, 'text/unicode', to, {}); to = to.value.QueryInterface(Components.interfaces.nsISupportsString); return to.toString(); } - catch(e){ + catch(e) { Zotero.debug(e, 1); - return text; + return html; } }