www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit f9f61fca861e1bc008c4174b06f385f15f7e7fb9
parent 7036d07acca60e2861d4facecb2158b3d47af98d
Author: Dan Stillman <dstillman@zotero.org>
Date:   Thu, 23 Jan 2014 17:30:16 -0500

More full-text indexing tweaks

Follow up from #440

- Convert curly single quotes to straight quotes before inserting
- Add General and Supplemental Unicode punctuation ranges to getClass()
  (so that fancy punctuation doesn't end up in words)
- Move single-quote test from getClass() to semanticSplitter(), and
  consider it a letter only if in the middle of a word
- Add comments to semanticSplitter()

This might be ever-so-slightly slower, but it's negligible. (War and Peace
seems to now take ~1570ms instead of ~1500ms for me.)

Diffstat:
Mchrome/content/zotero/xpcom/fulltext.js | 64++++++++++++++++++++++++++++++++++++++++++++++++++++------------
1 file changed, 52 insertions(+), 12 deletions(-)

diff --git a/chrome/content/zotero/xpcom/fulltext.js b/chrome/content/zotero/xpcom/fulltext.js @@ -145,13 +145,15 @@ Zotero.Fulltext = new function(){ if (cc < 0x2E80) { //alphabetical script if ((cc & 0xFF80) == 0) { // ascii if (c == ' ' || c == "\t" || c == "\r" || c == "\n") { return kWbClassSpace; } - - // deviation from Mozilla algorithm: count "'" as an alphaletter - if (c == "'" || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; } + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; } return kWbClassPunct; } if ((0xFF80 & cc) == 0x0E00) { return kWbClassThaiLetter; } if (cc == 0x00A0/*NBSP*/) { return kWbClassSpace; } + + // General and Supplemental Unicode punctuation + if ((cc >= 0x2000 && cc <= 0x206f) || (cc >= 0x2e00 && cc <= 0x2e7f)) { return kWbClassPunct; } + return kWbClassAlphaLetter; } @@ -1528,7 +1530,7 @@ Zotero.Fulltext = new function(){ Zotero.debug('No text to index'); return; } - + try { if (charset && charset != 'utf-8') { text = this.decoder.convertStringToUTF8(text, charset, true); @@ -1542,24 +1544,62 @@ Zotero.Fulltext = new function(){ var cclass = null; var strlen = text.length; for (var i = 0; i < strlen; i++) { - var c = text.charAt(i); - var cc = getClass(c, text.charCodeAt(i)); + var charCode = text.charCodeAt(i); + var cc = null; + + // Adjustments + if (charCode == 8216 || charCode == 8217) { + // Curly quotes to straight + var c = "'"; + } + else { + var c = text.charAt(i); + } + + // Consider single quote in the middle of a word a letter + if (c == "'" && word !== '') { + cc = kWbClassAlphaLetter; + } + if (!cc) { + cc = getClass(c, charCode); + } + + // When we reach space or punctuation, store the previous word if there is one if (cc == kWbClassSpace || cc == kWbClassPunct) { - if (word != '') { words[word] = true; word = ''; } + if (word != '') { + words[word] = true; + word = ''; + } + // When we reach Han 
character, store previous word and add Han character } else if (cc == kWbClassHanLetter) { - if (word != '') { words[word] = true; word = ''; } + if (word !== '') { + words[word] = true; + word = ''; + } words[c] = true; + // Otherwise, if character class hasn't changed, keep adding characters to previous word } else if (cc == cclass) { word += c.toLowerCase(); + // If character class is different, store previous word and start new word } else { - if (word != '') { words[word] = true; } + if (word !== '') { + words[word] = true; + } word = c.toLowerCase(); } cclass = cc; } - if (word != '') { words[word] = true; } - - return Object.keys(words); + if (word !== '') { + words[word] = true; + } + + return Object.keys(words).map(function (w) { + // Trim trailing single quotes + if (w.slice(-1) == "'") { + w = w.substr(0, w.length - 1); + } + return w; + }); } }