More full-text indexing tweaks - www - Unnamed repository; edit this file 'description' to name the repository.

commit f9f61fca861e1bc008c4174b06f385f15f7e7fb9
parent 7036d07acca60e2861d4facecb2158b3d47af98d
Author: Dan Stillman <dstillman@zotero.org>
Date:   Thu, 23 Jan 2014 17:30:16 -0500

More full-text indexing tweaks

Follow up from #440

- Convert curly single quotes to straight quotes before inserting
- Add General and Supplemental Unicode punctuation ranges to getClass()
  (so that fancy punctuation doesn't end up in words)
- Move single-quote test from getClass() to semanticSplitter(), and
  consider it a letter only if in the middle of a word
- Add comments to semanticSplitter()

This might be ever-so-slightly slower, but it's negligible. (War and Peace
seems to now take ~1570ms instead of ~1500ms for me.)

Diffstat:
M chrome/content/zotero/xpcom/fulltext.js  | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++++------------

1 file changed, 52 insertions(+), 12 deletions(-)
diff --git a/chrome/content/zotero/xpcom/fulltext.js b/chrome/content/zotero/xpcom/fulltext.js
@@ -145,13 +145,15 @@ Zotero.Fulltext = new function(){
 		if (cc < 0x2E80) { //alphabetical script
 			if ((cc & 0xFF80) == 0) { // ascii
 				if (c == ' '  || c == "\t" || c == "\r" || c == "\n") { return kWbClassSpace; }
-
-				// deviation from Mozilla algorithm: count "'" as an alphaletter
-				if (c == "'" || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; }
+				if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')) { return kWbClassAlphaLetter; }
 				return kWbClassPunct;
 			}
 			if ((0xFF80 & cc) == 0x0E00) { return kWbClassThaiLetter; }
 			if (cc == 0x00A0/*NBSP*/) { return kWbClassSpace; }
+			
+			// General and Supplemental Unicode punctuation
+			if ((cc >= 0x2000 && cc <= 0x206f) || (cc >= 0x2e00 && cc <= 0x2e7f)) { return kWbClassPunct; }
+			
 			return kWbClassAlphaLetter;
 		}
 
@@ -1528,7 +1530,7 @@ Zotero.Fulltext = new function(){
 			Zotero.debug('No text to index');
 			return;
 		}
-
+		
 		try {
 			if (charset && charset != 'utf-8') {
 				text = this.decoder.convertStringToUTF8(text, charset, true);
@@ -1542,24 +1544,62 @@ Zotero.Fulltext = new function(){
 		var cclass = null;
 		var strlen = text.length;
 		for (var i = 0; i < strlen; i++) {
-			var c = text.charAt(i);
-			var cc = getClass(c, text.charCodeAt(i));
+			var charCode = text.charCodeAt(i);
+			var cc = null;
+			
+			// Adjustments
+			if (charCode == 8216 || charCode == 8217) {
+				// Curly quotes to straight
+				var c = "'";
+			}
+			else {
+				var c = text.charAt(i);
+			}
+			
+			// Consider single quote in the middle of a word a letter
+			if (c == "'" && word !== '') {
+				cc = kWbClassAlphaLetter;
+			}
 			
+			if (!cc) {
+				cc = getClass(c, charCode);
+			}
+			
+			// When we reach space or punctuation, store the previous word if there is one
 			if (cc == kWbClassSpace || cc == kWbClassPunct) {
-				if (word != '') { words[word] = true; word = ''; }
+				if (word != '') {
+					words[word] = true;
+					word = '';
+				}
+			// When we reach Han character, store previous word and add Han character
 			} else if (cc == kWbClassHanLetter) {
-				if (word != '') { words[word] = true; word = ''; }
+				if (word !== '') {
+					words[word] = true;
+					word = '';
+				}
 				words[c] = true;
+			// Otherwise, if character class hasn't changed, keep adding characters to previous word
 			} else if (cc == cclass) {
 				word += c.toLowerCase();
+			// If character class is different, store previous word and start new word
 			} else {
-				if (word != '') { words[word] = true; }
+				if (word !== '') {
+					words[word] = true;
+				}
 				word = c.toLowerCase();
 			}
 			cclass = cc;
 		}
-		if (word != '') { words[word] = true; }
-
-		return Object.keys(words);
+		if (word !== '') {
+			words[word] = true;
+		}
+		
+		return Object.keys(words).map(function (w) {
+			// Trim trailing single quotes
+			if (w.slice(-1) == "'") {
+				w = w.substr(0, w.length - 1);
+			}
+			return w;
+		});
 	}
 }

	www Unnamed repository; edit this file 'description' to name the repository.
	Log \| Files \| Refs \| Submodules \| README \| LICENSE