commit e01aa91e4930752ebd393cebf3d92656ba868be7
parent 071c05e0bb5e0cded43d1a6dc9f320efd42c287d
Author: Simon Kornblith <simon@simonster.com>
Date: Sun, 12 Dec 2010 07:16:00 +0000
check UTF-8 in separate chunks, since running a regexp on too much data produces an error
Diffstat:
1 file changed, 30 insertions(+), 26 deletions(-)
diff --git a/chrome/content/zotero/xpcom/translation/browser_firefox.js b/chrome/content/zotero/xpcom/translation/browser_firefox.js
@@ -110,15 +110,10 @@ Zotero.Translate.IO.Read = function(file, mode) {
var charset = null;
// look for a BOM in the document
- if(Zotero.isFx4) {
- var first4 = NetUtil.readInputStreamToString(this._rawStream, 4);
- } else {
- var binStream = Components.classes["@mozilla.org/binaryinputstream;1"].
- createInstance(Components.interfaces.nsIBinaryInputStream);
- binStream.setInputStream(this._rawStream);
- var first4 = binStream.readByteArray(4);
- first4 = String.fromCharCode.apply(null, first4);
- }
+ var binStream = Components.classes["@mozilla.org/binaryinputstream;1"].
+ createInstance(Components.interfaces.nsIBinaryInputStream);
+ binStream.setInputStream(this._rawStream);
+ var first4 = binStream.readBytes(4);
for(var possibleCharset in BOMs) {
if(first4.substr(0, BOMs[possibleCharset].length) == BOMs[possibleCharset]) {
@@ -184,6 +179,7 @@ Zotero.Translate.IO.Read = function(file, mode) {
// Check whether the user has specified a charset preference
var charsetPref = Zotero.Prefs.get("import.charset");
if(charsetPref == "auto") {
+ Zotero.debug("Translate: Checking whether file is UTF-8");
// For auto-detect, we are basically going to check if the file could be valid
// UTF-8, and if this is true, we will treat it as UTF-8. Prior likelihood of
// UTF-8 is very high, so this should be a reasonable strategy.
@@ -205,27 +201,35 @@ Zotero.Translate.IO.Read = function(file, mode) {
// Read all currently available bytes from file. I'm not sure how many this is
// but it's a safe bet that we don't want to try to read any more than this, since
// it would slow things down considerably.
- if(Zotero.isFx4) {
- var fileContents = NetUtil.readInputStreamToString(this._rawStream, this._rawStream.available());
- } else {
- var fileContents = binStream.readByteArray(this._rawStream.available());
- fileContents = String.fromCharCode.apply(null, fileContents);
+ this._charset = "UTF-8";
+ let bytesAvailable;
+ while(bytesAvailable = this._rawStream.available()) {
+ // read 131072 bytes
+ let fileContents = binStream.readBytes(Math.min(131072, bytesAvailable));
+
+ // on failure, try reading up to 3 more bytes and see if that makes this
+ // valid (since we have chunked it)
+ let isUTF8;
+ for(let i=1; !(isUTF8 = UTF8Regex.test(fileContents)) && i <= 3; i++) {
+ if(this._rawStream.available()) {
+ fileContents += binStream.readBytes(1);
+ }
+ }
+
+ // if the regexp continues to fail, this is not UTF-8
+ if(!isUTF8) {
+ // Can't be UTF-8; see if a default charset is defined
+ this._charset = Zotero.Prefs.get("intl.charset.default", true);
+
+ // ISO-8859-1 by default
+ if(!this._charset) this._charset = "ISO-8859-1";
+
+ break;
+ }
}
// Seek back to beginning of file
this._seekToStart();
-
- // See whether this could be UTF-8
- if(UTF8Regex.test(fileContents)) {
- // Assume this is UTF-8
- this._charset = "UTF-8";
- } else {
- // Can't be UTF-8; see if a default charset is defined
- this._charset = Zotero.Prefs.get("intl.charset.default", true);
-
- // ISO-8859-1 by default
- if(!this._charset) this._charset = "ISO-8859-1";
- }
} else {
// No need to auto-detect; user has specified a charset
this._charset = charsetPref;