commit 045780d9acf2c2ca7cac4ac5209bdf6bc8c8a681
parent cec35d75668d861174c6bfc636db1f8ae34d1463
Author: Simon Kornblith <simon@simonster.com>
Date: Tue, 5 Sep 2006 07:51:55 +0000
closes #250, figure out proper text encodings for import/export
MODS uses the encoding as specified in the <?xml tag, or else UTF-8
RIS uses IBM850, since the spec says "IBM Extended Character Set" and it's the only code page Mozilla supports. (should I do this? or just use unicode?)
MARC uses UTF-8, since I don't think there's any way to get full MARC-8 support, and UTF-8 is now the preferred encoding anyway
Diffstat:
3 files changed, 175 insertions(+), 56 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/fileInterface.js b/chrome/chromeFiles/content/scholar/fileInterface.js
@@ -321,7 +321,7 @@ var Scholar_File_Interface = new function() {
// create UTF-8 output stream
var os = Components.classes["@mozilla.org/intl/converter-output-stream;1"].
createInstance(Components.interfaces.nsIConverterOutputStream);
- os.init(fStream, "UTF-8", 0, "?");
+ os.init(fStream, "UTF-8", 0, "?".charCodeAt(0));
os.writeString(html);
diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js
@@ -389,6 +389,8 @@ Scholar.Translate.prototype.getTranslators = function() {
// see which translators can translate
var possibleTranslators = this._findTranslators(translators);
+ this._closeStreams();
+
return possibleTranslators;
}
@@ -647,7 +649,13 @@ Scholar.Translate.prototype._generateSandbox = function() {
/*
* Check to see if _scraper_ can scrape this document
*/
-Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtensions) {
+Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtensions) {
+ if((this.type == "import" || this.type == "web") && !this.location) {
+ // if no location yet (e.g., getting list of possible web translators),
+ // just return true
+ return true;
+ }
+
// Test location with regular expression
if(translator.target && (this.type == "import" || this.type == "web")) {
var canTranslate = false;
@@ -662,9 +670,10 @@ Scholar.Translate.prototype._canTranslate = function(translator, ignoreExtension
if(translator.importRegexp) {
var regularExpression = translator.importRegexp;
} else {
- var regularExpression = new RegExp("\."+translator.target+"$", "i");
+ var regularExpression = new RegExp("\\."+translator.target+"$", "i");
}
}
+ Scholar.debug("path is "+this.path);
if(regularExpression.test(this.path)) {
canTranslate = true;
@@ -861,23 +870,6 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) {
} else {
Scholar.debug("translation complete");
- // serialize RDF and unregister dataSource
- if(this._rdf) {
- if(this._rdf.serializer) {
- this._rdf.serializer.Serialize(this._streams[0]);
- }
-
- try {
- var rdfService = Components.classes["@mozilla.org/rdf/rdf-service;1"].
- getService(Components.interfaces.nsIRDFService);
- rdfService.UnregisterDataSource(this._rdf.dataSource);
- } catch(e) {
- Scholar.debug("could not unregister data source");
- }
-
- delete this._rdf.dataSource;
- }
-
// close open streams
this._closeStreams();
@@ -902,6 +894,21 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) {
* closes open file streams, if any exist
*/
Scholar.Translate.prototype._closeStreams = function() {
+ // serialize RDF and unregister dataSource
+ if(this._rdf) {
+ if(this._rdf.serializer) {
+ this._rdf.serializer.Serialize(this._streams[0]);
+ }
+
+ try {
+ var rdfService = Components.classes["@mozilla.org/rdf/rdf-service;1"].
+ getService(Components.interfaces.nsIRDFService);
+ rdfService.UnregisterDataSource(this._rdf.dataSource);
+ } catch(e) {}
+
+ delete this._rdf.dataSource;
+ }
+
if(this._streams.length) {
for(var i in this._streams) {
var stream = this._streams[i];
@@ -924,8 +931,10 @@ Scholar.Translate.prototype._closeStreams = function() {
}
}
}
+
delete this._streams;
this._streams = new Array();
+ this._inputStream = null;
}
/*
@@ -1327,49 +1336,105 @@ Scholar.Translate.prototype._importConfigureIO = function() {
this._storagePointer = 0;
}
} else {
+ var me = this;
+
if(this._configOptions.dataMode == "rdf") {
- this._rdf = new Object()
-
- var IOService = Components.classes['@mozilla.org/network/io-service;1']
- .getService(Components.interfaces.nsIIOService);
- var fileHandler = IOService.getProtocolHandler("file")
- .QueryInterface(Components.interfaces.nsIFileProtocolHandler);
- var URL = fileHandler.getURLSpecFromFile(this.location);
-
- var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1']
- .getService(Components.interfaces.nsIRDFService);
- this._rdf.dataSource = RDFService.GetDataSourceBlocking(URL);
-
- // make an instance of the RDF handler
- this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(this._rdf.dataSource);
+ if(!this._rdf) {
+ this._rdf = new Object()
+
+ var IOService = Components.classes['@mozilla.org/network/io-service;1']
+ .getService(Components.interfaces.nsIIOService);
+ var fileHandler = IOService.getProtocolHandler("file")
+ .QueryInterface(Components.interfaces.nsIFileProtocolHandler);
+ var URL = fileHandler.getURLSpecFromFile(this.location);
+
+ var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1']
+ .getService(Components.interfaces.nsIRDFService);
+ this._rdf.dataSource = RDFService.GetDataSourceBlocking(URL);
+
+ // make an instance of the RDF handler
+ this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(this._rdf.dataSource);
+ }
} else {
// open file and set read methods
- var fStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
- .createInstance(Components.interfaces.nsIFileInputStream);
- fStream.init(this.location, 0x01, 0664, 0);
- this._streams.push(fStream);
+ if(this._inputStream) {
+ this._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
+ .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, 0);
+ this._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
+ } else {
+ this._inputStream = Components.classes["@mozilla.org/network/file-input-stream;1"]
+ .createInstance(Components.interfaces.nsIFileInputStream);
+ this._inputStream.init(this.location, 0x01, 0664, 0);
+ this._streams.push(this._inputStream);
+ }
- if(this._configOptions.dataMode == "line") { // line by line reading
- var notEof = true;
- var lineData = new Object();
+ var intlStream = null;
+ var filePosition = 0;
+
+ // allow translator to set charset
+ this._sandbox.Scholar.setCharacterSet = function(charset) {
+ // seek
+ if(filePosition != 0) {
+ me._inputStream.QueryInterface(Components.interfaces.nsISeekableStream)
+ .seek(Components.interfaces.nsISeekableStream.NS_SEEK_SET, filePosition);
+ me._inputStream.QueryInterface(Components.interfaces.nsIFileInputStream);
+ }
- fStream.QueryInterface(Components.interfaces.nsILineInputStream);
+ intlStream = Components.classes["@mozilla.org/intl/converter-input-stream;1"]
+ .createInstance(Components.interfaces.nsIConverterInputStream);
+ try {
+ intlStream.init(me._inputStream, charset, 1024,
+ Components.interfaces.nsIConverterInputStream.DEFAULT_REPLACEMENT_CHARACTER);
+ } catch(e) {
+ throw "Text encoding not supported";
+ }
+ me._streams.push(intlStream);
+ }
+
+ var str = new Object();
+ if(this._configOptions.dataMode == "line") { // line by line reading
+ this._inputStream.QueryInterface(Components.interfaces.nsILineInputStream);
this._sandbox.Scholar.read = function() {
- if(notEof) {
- notEof = fStream.readLine(lineData);
- return lineData.value;
+ if(intlStream && intlStream instanceof Components.interfaces.nsIUnicharLineInputStream) {
+ var amountRead = intlStream.readLine(str);
+ } else {
+ var amountRead = me._inputStream.readLine(str);
+ }
+ if(amountRead) {
+ filePosition += amountRead;
+ return str.value;
} else {
return false;
}
}
} else { // block reading
- var sStream = Components.classes["@mozilla.org/scriptableinputstream;1"]
- .createInstance(Components.interfaces.nsIScriptableInputStream);
- sStream.init(fStream);
+ var sStream;
this._sandbox.Scholar.read = function(amount) {
- return sStream.read(amount);
+ if(intlStream) {
+ // read from international stream, if one is available
+ var amountRead = intlStream.readString(amount, str);
+
+ if(amountRead) {
+ filePosition += amountRead;
+ return str.value;
+ } else {
+ return false;
+ }
+ } else {
+ // allocate sStream on the fly
+ if(!sStream) {
+ sStream = Components.classes["@mozilla.org/scriptableinputstream;1"]
+ .createInstance(Components.interfaces.nsIScriptableInputStream);
+ sStream.init(me._inputStream);
+ }
+
+ // read from the scriptable input stream
+ var string = sStream.read(amount);
+ filePosition += string.length;
+ return string;
+ }
}
// attach sStream to stack of streams to close
@@ -1473,8 +1538,24 @@ Scholar.Translate.prototype._exportConfigureIO = function() {
// make an instance of the RDF handler
this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(this._rdf.dataSource, this._rdf.serializer);
- } else { // regular io; write just writes to file
- this._sandbox.Scholar.write = function(data) { fStream.write(data, data.length) };
+ } else {
+ // regular io; write just writes to file
+ var intlStream = null;
+
+ // allow setting of character sets
+ this._sandbox.Scholar.setCharacterSet = function(charset) {
+ intlStream = Components.classes["@mozilla.org/intl/converter-output-stream;1"]
+ .createInstance(Components.interfaces.nsIConverterOutputStream);
+ intlStream.init(fStream, charset, 1024, "?".charCodeAt(0));
+ };
+
+ this._sandbox.Scholar.write = function(data) {
+ if(intlStream) {
+ intlStream.writeString(data);
+ } else {
+ fStream.write(data, data.length);
+ }
+ };
}
}
@@ -1628,6 +1709,10 @@ Scholar.Translate.prototype._initializeInternalIO = function() {
*/
Scholar.Translate.prototype._storageFunctions = function(read, write) {
var me = this;
+
+ // add setCharacterSet method that does nothing
+ this._sandbox.Scholar.setCharacterSet = function() {}
+
if(write) {
// set up write() method
this._sandbox.Scholar.write = function(data) {
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,4 +1,4 @@
--- 79
+-- 80
-- Set the following timestamp to the most recent scraper update date
REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-31 22:44:00'));
@@ -3840,6 +3840,7 @@ function detectImport() {
'var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"];
function doExport() {
+ Scholar.setCharacterSet("utf-8");
var modsCollection = <modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-2.xsd" />;
var item;
@@ -4115,15 +4116,39 @@ function doImport() {
var text = "";
var read;
+ // read until we see if the file begins with a parse instruction
+ read = " ";
+ while(read == " " || read == "\n" || read == "\r") {
+ read = Scholar.read(1);
+ }
+
+ var firstPart = read + Scholar.read(4);
+ if(firstPart == "<?xml") {
+ // got a parse instruction, read until it ends
+ read = true;
+ while((read !== false) && (read !== ">")) {
+ read = Scholar.read(1);
+ firstPart += read;
+ }
+ var encodingRe = /encoding=[''"]([^''"]+)[''"]/;
+ var m = encodingRe.exec(firstPart);
+ // set character set
+ try {
+ Scholar.setCharacterSet(m[1]);
+ } catch(e) {
+ Scholar.setCharacterSet("utf-8");
+ }
+ } else {
+ Scholar.setCharacterSet("utf-8");
+ text += firstPart;
+ }
+
// read in 16384 byte increments
while(read = Scholar.read(16384)) {
text += read;
}
Scholar.Utilities.debug("read in");
- // eliminate <?xml ?> heading so we can parse as XML
- text = text.replace(/<\?xml[^?]+\?>/, "");
-
// parse with E4X
var m = new Namespace("http://www.loc.gov/mods/v3");
// why does this default namespace declaration not work!?
@@ -5495,7 +5520,9 @@ function processTag(item, tag, value) {
}
function doImport(attachments) {
- Scholar.Utilities.debug("hello");
+ // this is apparently the proper character set for RIS, although i''m not
+ // sure how many people follow this
+ Scholar.setCharacterSet("IBM850");
var line = true;
var tag = data = false;
@@ -5560,6 +5587,10 @@ function addTag(tag, value) {
}
function doExport() {
+ // this is apparently the proper character set for RIS, although i''m not
+ // sure how many people follow this
+ Scholar.setCharacterSet("IBM850");
+
var item;
while(item = Scholar.nextItem()) {
@@ -5974,6 +6005,9 @@ function doImport() {
var text;
var holdOver = ""; // part of the text held over from the last loop
+ Scholar.Utilities.debug("doing import: about to set character set");
+ Scholar.setCharacterSet("utf-8");
+
while(text = Scholar.read(4096)) { // read in 4096 byte increments
var records = text.split("\x1D");