commit 539957a93b7ff3d0c0444cf73dadd4fd38e97f4e
parent ad5ce20c82eb56046efda679ce36d7294d95a53a
Author: Simon Kornblith <simon@simonster.com>
Date: Fri, 8 Sep 2006 20:44:05 +0000
- closes #281, look for BOM when importing to override charset. the BOM is a nice way to detect UTF encodings, although it won't help distinguish, e.g., ISO 8859-1 from MacRoman. since EndNote adds a BOM to all of its export files, this means non-ASCII characters should now be preserved when exported from EndNote.
- better error handling for translators ("Could Not Add Item" should now pop up in all circumstances)
Diffstat:
1 file changed, 136 insertions(+), 27 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js
@@ -65,6 +65,7 @@
* _storage - the stored string to be treated as input
* _storageLength - the length of the stored string
* _exportFileDirectory - the directory to which files will be exported
+ * _hasBOM - the charset name given by the BOM of the file to be imported (e.g. "UTF-8"), false if the file has no BOM, or null if not yet checked
*
* WEB-ONLY PRIVATE PROPERTIES:
*
@@ -371,6 +372,9 @@ Scholar.Translate.prototype.setHandler = function(type, handler) {
* itemType - the type of item this scraper says it will scrape
*/
Scholar.Translate.prototype.getTranslators = function() {
+ // clear BOM
+ this._hasBOM = null;
+
if(Scholar.Translate.cache) {
var translators = Scholar.Translate.cache[this.type];
} else {
@@ -445,9 +449,14 @@ Scholar.Translate.prototype._loadTranslator = function() {
try {
Components.utils.evalInSandbox(this.translator[0].code, this._sandbox);
} catch(e) {
- Scholar.debug(e+' in parsing code for '+this.translator[0].label);
- this._translationComplete(false);
- return false;
+ var error = e+' in parsing code for '+this.translator[0].label;
+ if(this._parentTranslator) {
+ throw error;
+ } else {
+ Scholar.debug(error);
+ this._translationComplete(false);
+ return false;
+ }
}
return true;
@@ -459,10 +468,14 @@ Scholar.Translate.prototype._loadTranslator = function() {
Scholar.Translate.prototype.translate = function() {
Scholar.debug("translate called");
+ /*
+ * initialize properties
+ */
this.newItems = new Array();
this.newCollections = new Array();
this._IDMap = new Array();
this._complete = false;
+ this._hasBOM = null;
if(!this.translator || !this.translator.length) {
throw("cannot translate: no translator specified");
@@ -1274,8 +1287,13 @@ Scholar.Translate.prototype._web = function() {
try {
this._sandbox.doWeb(this.document, this.location);
} catch(e) {
- Scholar.debug(e+' in executing code for '+this.translator[0].label);
- return false;
+ var error = e+' in executing code for '+this.translator[0].label;
+ if(this._parentTranslator) {
+ throw error;
+ } else {
+ Scholar.debug(error); // log the message rather than calling debug() with nothing
+ return false;
+ }
}
return true;
@@ -1304,8 +1322,14 @@ Scholar.Translate.prototype._import = function() {
try {
this._sandbox.doImport();
} catch(e) {
- Scholar.debug(e+' in executing code for '+this.translator[0].label);
- return false;
+ Scholar.debug(e.toSource());
+ var error = e+' in executing code for '+this.translator[0].label;
+ if(this._parentTranslator) {
+ throw error;
+ } else {
+ Scholar.debug(error);
+ return false;
+ }
}
return true;
@@ -1370,27 +1394,32 @@ Scholar.Translate.prototype._importConfigureIO = function() {
this._streams.push(this._inputStream);
}
- var intlStream = null;
var filePosition = 0;
-
- // allow translator to set charset
- this._sandbox.Scholar.setCharacterSet = function(charset) {
- // seek
- if(filePosition != 0) {
- me._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
- .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, filePosition);
- me._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
- }
-
- intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
- .createInstance(Components.interfaces.nsIConverterInputStream);
- try {
- intlStream.init(me._inputStream, charset, 1024,
- Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
- } catch(e) {
- throw "Text encoding not supported";
+ var intlStream = this._importDefuseBOM();
+ if(intlStream) {
+ // found a UTF BOM at the beginning of the file; don't allow
+ // translator to set the character set
+ this._sandbox.Scholar.setCharacterSet = function() {}
+ } else {
+ // allow translator to set charset
+ this._sandbox.Scholar.setCharacterSet = function(charset) {
+ // seek
+ if(filePosition != 0) {
+ me._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
+ .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, filePosition);
+ me._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
+ }
+
+ intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
+ .createInstance(Components.interfaces.nsIConverterInputStream);
+ try {
+ intlStream.init(me._inputStream, charset, 1024,
+ Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
+ } catch(e) {
+ throw "Text encoding not supported";
+ }
+ me._streams.push(intlStream);
}
- me._streams.push(intlStream);
}
var str = new Object();
@@ -1398,7 +1427,8 @@ Scholar.Translate.prototype._importConfigureIO = function() {
this._inputStream.QueryInterface(Components.interfaces.nsILineInputStream);
this._sandbox.Scholar.read = function() {
- if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) {
+ if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) {
+ Scholar.debug("using intlStream");
var amountRead = intlStream.readLine(str);
} else {
var amountRead = me._inputStream.readLine(str);
@@ -1447,6 +1477,85 @@ Scholar.Translate.prototype._importConfigureIO = function() {
}
/*
+ * searches for a UTF BOM at the beginning of the input stream. if one is found,
+ * returns a converter-input-stream for that UTF type, reading from just past
+ * the BOM, and sets _hasBOM to the charset name. if one is not found, rewinds
+ * the stream, sets _hasBOM to false to prevent further checking, returns false.
+ */
+Scholar.Translate.prototype._importDefuseBOM = function() {
+	// if already found not to have a BOM, skip (stream position is untouched)
+	if(this._hasBOM === false) {
+		return false;
+	}
+	
+	if(!this._hasBOM) {
+		// if not checked for a BOM, open a binary input stream and read
+		var binStream = Components.classes["@mozilla.org/binaryinputstream;1"].
+			createInstance(Components.interfaces.nsIBinaryInputStream);
+		binStream.setInputStream(this._inputStream);
+		
+		// read the first byte
+		var byte1 = binStream.read8();
+		
+		// at the moment, we don't support UTF-32 or UTF-7. while mozilla
+		// supports these encodings, they add slight additional complexity to
+		// the function and anyone using them for storing bibliographic metadata
+		// is insane.
+		if(byte1 == 0xEF) { // UTF-8: EF BB BF
+			var byte2 = binStream.read8();
+			if(byte2 == 0xBB) {
+				var byte3 = binStream.read8();
+				if(byte3 == 0xBF) {
+					this._hasBOM = "UTF-8";
+				}
+			}
+		} else if(byte1 == 0xFE) { // UTF-16BE: FE FF
+			var byte2 = binStream.read8();
+			if(byte2 == 0xFF) {
+				this._hasBOM = "UTF-16BE";
+			}
+		} else if(byte1 == 0xFF) { // UTF-16LE: FF FE
+			var byte2 = binStream.read8();
+			if(byte2 == 0xFE) {
+				this._hasBOM = "UTF-16LE"; // must be a charset name the converter accepts
+			}
+		}
+		
+		if(!this._hasBOM) {
+			// seek back to beginning of file, un-reading any bytes consumed above
+			this._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
+				.seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, 0);
+			this._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
+			
+			// say there's no BOM
+			this._hasBOM = false;
+			
+			return false;
+		}
+	} else {
+		// if it had a BOM the last time, it has one this time, too. seek to the
+		// correct position (the UTF-8 BOM is 3 bytes, the UTF-16 BOMs are 2).
+		
+		if(this._hasBOM == "UTF-8") {
+			var seekPosition = 3;
+		} else {
+			var seekPosition = 2;
+		}
+		
+		this._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
+			.seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, seekPosition);
+		this._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
+	}
+	
+	// we know what kind of BOM it has; generate a (function-local) input stream
+	var intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
+		.createInstance(Components.interfaces.nsIConverterInputStream);
+	intlStream.init(this._inputStream, this._hasBOM, 1024,
+		Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
+	return intlStream;
+}
+
+/*
* does the actual export, after code has been loaded and parsed
*/
Scholar.Translate.prototype._export = function() {