commit 152c9bf9e7ee421d8e56db29adef7680b392c096
parent 6c55e63eab9ea6b538105b76180aae3de6bc4270
Author: Simon Kornblith <simon@simonster.com>
Date: Tue, 6 Jun 2006 18:25:45 +0000
- Small changes to MARC record support
- Implemented loadDocument API, for loading and parsing the DOMs of HTML documents in the background
- Added scraper code to SVN repository (now includes 12 scrapers, see Writeboard for details)
To update to the latest versions of all scrapers, ensure you have an up-to-date version of sqlite3, then run:
sqlite3 ~/Library/Application\ Support/Firefox/Profiles/profileName/scholar.sqlite < scrapers.sql
Diffstat:
5 files changed, 1206 insertions(+), 85 deletions(-)
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js
@@ -35,6 +35,7 @@ Scholar.Ingester.Interface.init = function() {
*/
Scholar.Ingester.Interface.chromeLoad = function() {
Scholar.Ingester.Interface.tabBrowser = document.getElementById("content");
+ Scholar.Ingester.Interface.hiddenBrowser = document.getElementById("scholar-hidden-browser");
Scholar.Ingester.Interface.appContent = document.getElementById("appcontent");
Scholar.Ingester.Interface.statusImage = document.getElementById("scholar-status-image");
@@ -189,7 +190,7 @@ Scholar.Ingester.Interface._setDocument = function(browser) {
browser.setAttribute("scholar-key", key);
}
}
- Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser);
+ Scholar.Ingester.Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, Scholar.Ingester.Interface.hiddenBrowser);
Scholar.Ingester.Interface.browserDocuments[key].retrieveScraper();
}
diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.xul b/chrome/chromeFiles/content/scholar/ingester/browser.xul
@@ -19,4 +19,7 @@
<image id="scholar-status-image" width="16" height="16" onclick="Scholar.Ingester.Interface.scrapeThisPage()" />
</statusbarpanel>
</statusbar>
+ <box style="visibility: collapse">
+ <browser id="scholar-hidden-browser" />
+ </box>
</overlay>
diff --git a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js
@@ -48,7 +48,9 @@ Scholar.Ingester.Model.prototype.detachRepository = function() {}
/////////////////////////////////////////////////////////////////
// Scholar.Ingester.Utilities class, a set of methods to assist in data
// extraction. Most code here was stolen directly from the Piggy Bank project.
-Scholar.Ingester.Utilities = function() {}
+Scholar.Ingester.Utilities = function(hiddenBrowser) {
+ this.hiddenBrowser = hiddenBrowser;
+}
// Adapter for Piggy Bank function to print debug messages; log level is
// fixed at 4 (could change this)
@@ -99,6 +101,7 @@ Scholar.Ingester.Utilities.prototype.gatherElementsOnXPath = function(doc, paren
// Loads a single document for a scraper, running succeeded() on success or
// failed() on failure
Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succeeded, failed) {
+ Scholar.debug("loadDocument called");
this.processDocuments(browser, null, [ url ], succeeded, function() {}, failed);
}
@@ -112,6 +115,9 @@ Scholar.Ingester.Utilities.prototype.loadDocument = function(url, browser, succe
// exception - a function to execute if an exception occurs (exceptions are
// also logged in the Firefox Scholar log)
Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) {
+ var hiddenBrowser = this.hiddenBrowser;
+ Scholar.debug("processDocuments called");
+
try {
if (urls.length == 0) {
if (firstDoc) {
@@ -128,53 +134,51 @@ Scholar.Ingester.Utilities.prototype.processDocuments = function(browser, firstD
if (urlIndex < urls.length) {
try {
var url = urls[urlIndex];
- var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
- b.loadURI(url);
+ Scholar.debug("loading "+url);
+ hiddenBrowser.loadURI(url);
} catch (e) {
- exception(e);
Scholar.debug("Scholar.Ingester.Utilities.processDocuments doLoad: " + e, 2);
+ exception(e);
}
} else {
- window.setTimeout(done, 10);
+ hiddenBrowser.setTimeout(done, 10);
}
};
var onLoad = function() {
- try {
- var b = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser").selectedBrowser;
- processor(b.contentDocument, doLoad);
- } catch (e) {
- exception(e);
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+ Scholar.debug("onLoad called");
+ if(hiddenBrowser.id == "scholar-hidden-browser") {
+ hiddenBrowser.removeEventListener("DOMContentLoaded", onLoad, true);
+ try {
+ var newHiddenBrowser = new Object();
+ Scholar.debug("new hidden browser");
+ newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument;
+ newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow;
+ Scholar.debug("added attributes");
+ processor(newHiddenBrowser);
+ Scholar.debug("called processor");
+ } catch (e) {
+ Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLoad: " + e, 2);
+ exception(e);
+ }
}
};
var init = function() {
- var listener;
- listener.onStateChange = function(webProgress, request, stateFlags, status) {
- if ((stateFlags & Components.interfaces.nsIWebProgressListener.STATE_STOP) > 0 &&
- request.name == urls[urlIndex]) {
- try {
- Scholar.Ingester.progressDialog.setTimeout(onLoad, 10);
- } catch (e) {
- exception(e);
- Scholar.debug("Scholar.Ingester.Utilities.processDocuments onLocationChange: " + e, 2);
- }
- }
- };
-
- var tb = Scholar.Ingester.progressDialog.document.getElementById("hidden-browser");
- tb.addProgressListener(listener, Components.interfaces.nsIWebProgress.NOTIFY_STATUS);
+ Scholar.debug("init called");
+ hiddenBrowser.addEventListener("DOMContentLoaded", onLoad, true);
if (firstDoc) {
+ Scholar.debug("processing");
processor(firstDoc, doLoad);
} else {
+ Scholar.debug("doing load");
doLoad();
}
}
- w.addEventListener("load", init, false);
+ init();
} catch (e) {
+ Scholar.debug("processDocuments: " + e);
exception(e);
- PB_Debug.print("processDocuments: " + e);
}
}
@@ -209,12 +213,18 @@ Scholar.Ingester.Utilities.prototype.collectURLsWithSubstring = function(doc, su
// break compatibility
Scholar.Ingester.Utilities.prototype._MARCCleanString = function(author) {
+ // Trims MARC punctuation (whitespace . , / [ ] :) from both ends of the
+ // value, then collapses runs of spaces inside it to a single space.
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- return author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+ author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+ // The g flag is required: without it only the first run of spaces is collapsed
+ return author.replace(/ +/g, ' ');
}
Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
+ // Cleans a MARC personal-name value: trims MARC punctuation from both ends,
+ // collapses runs of spaces, restores the period after a trailing initial and
+ // turns "Last, First" into "First Last".
author = author.replace(/^[\s\.\,\/\[\]\:]+/, '');
- author = author.replace(/[\s\.\,\/\[\]\:]+$/, '');
+ author = author.replace(/[\s\,\/\[\]\:\.]+$/, '');
+ // The g flag is required: every run of spaces must be collapsed so that the
+ // split on ', ' below sees single spaces throughout the name
+ author = author.replace(/ +/g, ' ');
+ // Add period for initials
+ if(author.substring(author.length-2, author.length-1) == " ") {
+ author += ".";
+ }
var splitNames = author.split(', ');
if(splitNames.length > 1) {
author = splitNames[1]+' '+splitNames[0];
@@ -222,6 +232,16 @@ Scholar.Ingester.Utilities.prototype._MARCCleanAuthor = function(author) {
return author;
}
+// Cleans a MARC number value: trims MARC punctuation (whitespace . , / [ ] :)
+// from both ends, then returns only the text that precedes the first space.
+Scholar.Ingester.Utilities.prototype._MARCCleanNumber = function(author) {
+ var trimmed = author.replace(/^[\s\.\,\/\[\]\:]+/, '').replace(/[\s\.\,\/\[\]\:]+$/, '');
+ // This pattern matches every string (possibly as ""), so exec never returns null
+ return /^[^ ]*/.exec(trimmed)[0];
+}
+
Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) {
if(!part) {
part = 'a';
@@ -253,27 +273,29 @@ Scholar.Ingester.Utilities.prototype._MARCAssociateField = function(record, uri,
// This is an extension to PiggyBank's architecture. It's here so that we don't
// need an enormous library for each scraper that wants to use MARC records
-Scholar.Ingester.Utilities.prototype.importMARCRecord = function(text, format, uri, model) {
+Scholar.Ingester.Utilities.prototype.importMARCRecord = function(record, uri, model) {
var prefixDC = 'http://purl.org/dc/elements/1.1/';
var prefixDCMI = 'http://purl.org/dc/dcmitype/';
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
- var record = new Scholar.Ingester.MARC_Record();
- record.load(text, format);
-
// Extract ISBNs
- model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
+ model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN ');
// Extract ISSNs
- model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanString, 'ISBN ');
+ model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN ');
// Extract creators
model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this._MARCCleanAuthor);
model = this._MARCAssociateField(record, uri, model, '110', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '111', prefixDC + 'creator', this._MARCCleanString);
model = this._MARCAssociateField(record, uri, model, '130', prefixDC + 'creator', this._MARCCleanString);
- if(!model.data[uri][prefixDC + 'creator']) {
+ model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this._MARCCleanAuthor);
+ model = this._MARCAssociateField(record, uri, model, '710', prefixDC + 'contributor', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '711', prefixDC + 'contributor', this._MARCCleanString);
+ model = this._MARCAssociateField(record, uri, model, '730', prefixDC + 'contributor', this._MARCCleanString);
+ if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'])) { // some LOC entries have no listed author, but have the author
+ // in the person subject field as the first entry
var field = record.get_field_subfields('600');
- if(field) {
- model = this.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
+ if(field[0]) {
+ model.addStatement(uri, prefixDC + 'creator', this._MARCCleanAuthor(field[0]['a']));
}
}
// Extract title
@@ -403,12 +425,13 @@ Scholar.Ingester.HTTPUtilities.prototype.stateChange = function(xmlhttp, onStatu
/*
* Constructor for Document object
*/
-Scholar.Ingester.Document = function(browserWindow){
+Scholar.Ingester.Document = function(browserWindow, hiddenBrowser){
this.browser = browserWindow;
+ this.model = new Scholar.Ingester.Model();
this.appSvc = Cc["@mozilla.org/appshell/appShellService;1"]
.getService(Ci.nsIAppShellService);
- this.scraper = null
- this.model = new Scholar.Ingester.Model();
+ this.scraper = null;
+ this.hiddenBrowser = hiddenBrowser;
this._generateSandbox();
}
@@ -530,11 +553,13 @@ Scholar.Ingester.Document.prototype._generateSandbox = function() {
this.sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href);
this.sandbox.browser = this.browser;
this.sandbox.doc = this.sandbox.browser.contentDocument;
- this.sandbox.utilities = new Scholar.Ingester.Utilities;
+ this.sandbox.utilities = new Scholar.Ingester.Utilities(this.hiddenBrowser);
this.sandbox.utilities.HTTPUtilities = new Scholar.Ingester.HTTPUtilities(this.appSvc.hiddenDOMWindow);
this.sandbox.window = this.window;
this.sandbox.model = this.model;
this.sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult;
+ this.sandbox.MARC_Record = Scholar.Ingester.MARC_Record;
+ this.sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record();
var me = this;
this.sandbox.wait = function(){ me._waitForCompletion = true; };
@@ -552,50 +577,90 @@ Scholar.Ingester.Document.prototype._updateDatabase = function() {
var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/';
for(var uri in this.model.data) {
- var newItem = Scholar.Items.getNewItemByType(1);
+ if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
+ var newItem = Scholar.Items.getNewItemByType(2);
+ } else {
+ var newItem = Scholar.Items.getNewItemByType(1);
+ }
newItem.setField("source", uri);
if(this.model.data[uri][prefixDC + 'title']) {
newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]);
}
- if(this.model.data[uri][prefixDC + 'publisher']) {
- newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
- }
- if(this.model.data[uri][prefixDC + 'year']) {
- if(this.model.data[uri][prefixDC + 'year'].length == 4) {
- newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
- } else {
- try {
- newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
- this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
- this.model.data[uri][prefixDC + 'year'][0].length));
- } catch(e) {}
- }
- }
- if(this.model.data[uri][prefixDC + 'edition']) {
- newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'series']) {
- newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
- }
- if(this.model.data[uri][prefixDummy + 'place']) {
- newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
- }
- if(this.model.data[uri][prefixDC + 'identifier']) {
- for(i in this.model.data[uri][prefixDC + 'identifier']) {
- if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
- newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
- break;
- }
- }
- }
+ var creatorIndex = 0;
if(this.model.data[uri][prefixDC + 'creator']) {
for(i in this.model.data[uri][prefixDC + 'creator']) {
var creator = this.model.data[uri][prefixDC + 'creator'][i];
var spaceIndex = creator.lastIndexOf(" ");
var lastName = creator.substring(spaceIndex+1, creator.length);
var firstName = creator.substring(0, spaceIndex);
+
+ newItem.setCreator(creatorIndex, firstName, lastName, 1);
+ creatorIndex++;
+ }
+ }
+ if(this.model.data[uri][prefixDC + 'contributor']) {
+ for(i in this.model.data[uri][prefixDC + 'contributor']) {
+ var creator = this.model.data[uri][prefixDC + 'contributor'][i];
+ var spaceIndex = creator.lastIndexOf(" ");
+ var lastName = creator.substring(spaceIndex+1, creator.length);
+ var firstName = creator.substring(0, spaceIndex);
- newItem.setCreator(i, firstName, lastName);
+ newItem.setCreator(creatorIndex, firstName, lastName, 2);
+ creatorIndex++;
+ }
+ }
+ if(this.model.data[uri][prefixRDF + 'type'] == (prefixDummy + 'journal')) {
+ if(this.model.data[uri][prefixDummy + 'publication']) {
+ newItem.setField("publication", this.model.data[uri][prefixDummy + 'publication'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'volume']) {
+ newItem.setField("volume", this.model.data[uri][prefixDummy + 'volume'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'number']) {
+ newItem.setField("number", this.model.data[uri][prefixDummy + 'number'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'pages']) {
+ newItem.setField("pages", this.model.data[uri][prefixDummy + 'pages'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'identifier']) {
+ for(i in this.model.data[uri][prefixDC + 'identifier']) {
+ if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISSN') {
+ newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
+ break;
+ }
+ }
+ }
+ } else {
+ if(this.model.data[uri][prefixDC + 'publisher']) {
+ newItem.setField("publisher", this.model.data[uri][prefixDC + 'publisher'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'year']) {
+ if(this.model.data[uri][prefixDC + 'year'].length == 4) {
+ newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]);
+ } else {
+ try {
+ newItem.setField(this.model.data[uri][prefixDC + 'year'][0].substring(
+ this.model.data[uri][prefixDC + 'year'][0].lastIndexOf(" ")+1,
+ this.model.data[uri][prefixDC + 'year'][0].length));
+ } catch(e) {}
+ }
+ }
+ if(this.model.data[uri][prefixDC + 'edition']) {
+ newItem.setField("edition", this.model.data[uri][prefixDC + 'edition'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'series']) {
+ newItem.setField("series", this.model.data[uri][prefixDummy + 'series'][0]);
+ }
+ if(this.model.data[uri][prefixDummy + 'place']) {
+ newItem.setField("place", this.model.data[uri][prefixDummy + 'place'][0]);
+ }
+ if(this.model.data[uri][prefixDC + 'identifier']) {
+ for(i in this.model.data[uri][prefixDC + 'identifier']) {
+ if(this.model.data[uri][prefixDC + 'identifier'][i].substring(0, 4) == 'ISBN') {
+ newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][0].substring(5));
+ break;
+ }
+ }
}
}
newItem.save();
diff --git a/chrome/chromeFiles/content/scholar/xpcom/marc.js b/chrome/chromeFiles/content/scholar/xpcom/marc.js
@@ -80,8 +80,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
this.add_field(tag,ind1,ind2,value);
}
- }
- if (f == 'MARC_Harvard') {
+ } else if (f == 'MARC_Harvard') {
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
@@ -128,8 +127,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
}
this.add_field_005();
- }
- if (f == 'MARC_BNI') {
+ } else if (f == 'MARC_BNI') {
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
@@ -167,8 +165,7 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
}
this.add_field_005();
- }
- if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov
+ } else if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov
var linee = s.split('\n');
for (var i=0; i<linee.length; i++) {
linee[i] = this._trim(linee[i]);
@@ -209,6 +206,46 @@ Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s
}
}
this.add_field_005();
+ } else if (f == 'MARC_PAC') {
+ var linee = s.split('\n');
+ for (var i=0; i<linee.length; i++) {
+ linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0
+ linee[i] = linee[i].replace(/_/g,' ');
+ linee[i] = linee[i].replace(/\t/g,'');
+ linee[i] = this._trim(linee[i]);
+ if (linee[i] == '') continue; // jumps empty lines
+ var replacer = this.subfield_delimiter+'$1';
+ linee[i] = linee[i].replace(/\|(.)/g,replacer);
+ linee[i] = linee[i].replace(/\|/g,this.subfield_delimiter);
+ var tag = linee[i].substr(0,3);
+ var ind1 = linee[i].substr(4,1);
+ var ind2 = linee[i].substr(5,1);
+ var value = this.subfield_delimiter+'a'+linee[i].substr(7);
+ if(linee[i].substr(0, 6) == "LEADER") {
+ value = linee[i].substr(7);
+ this.leader.record_length = '00000';
+ this.leader.record_status = value.substr(5,1);
+ this.leader.type_of_record = value.substr(6,1);
+ this.leader.bibliographic_level = value.substr(7,1);
+ this.leader.type_of_control = value.substr(8,1);
+ this.leader.character_coding_scheme = value.substr(9,1);
+ this.leader.indicator_count = '2';
+ this.leader.subfield_code_length = '2';
+ this.leader.base_address_of_data = '00000';
+ this.leader.encoding_level = value.substr(17,1);
+ this.leader.descriptive_cataloging_form = value.substr(18,1);
+ this.leader.linked_record_requirement = value.substr(19,1);
+ this.leader.entry_map = '4500';
+
+ this.directory = '';
+ this.directory_terminator = this.field_terminator;
+ this.variable_fields = new Array();
+ }
+ else if (tag > '008' && tag < '899') { // jumps low and high tags
+ if (tag != '040') this.add_field(tag,ind1,ind2,value);
+ }
+ }
+ this.add_field_005();
}
this.update_record_length();
@@ -310,7 +347,7 @@ Scholar.Ingester.MARC_Record.prototype.exists = function(tag) { // field existen
return false;
}
-function MARC_field(rec,tag,ind1,ind2,value) { // new MARC gield
+Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC gield
this.tag = tag;
this.occ = rec.count_occ(tag)+1; // occurrence order no.
this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' ';
@@ -428,7 +465,7 @@ Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) { //
Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record
if (tag.length != 3) { return false; }
- var F = new MARC_field(this,tag,ind1,ind2,value);
+ var F = new this.MARC_field(this,tag,ind1,ind2,value);
// adds pointer to list of fields
this.variable_fields[this.variable_fields.length] = F;
// adds the entry to the directory
diff --git a/scrapers.sql b/scrapers.sql
@@ -0,0 +1,1014 @@
+BEGIN TRANSACTION;
+DELETE FROM scrapers;
+INSERT INTO "scrapers" VALUES(1, NULL, NULL, 20060603002000, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+ if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+ return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+var cleanString = function(s) {
+ s = utilities.trimString(s);
+ return s.replace(/ +/g, " ");
+}
+
+var uri = doc.location.href;
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+
+// Retrieve authors
+var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+
+ model.addStatement(uri, prefixDC + ''creator'', cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here
+}
+
+// Retrieve data from "Product Details" box
+var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+ var attribute = cleanString(getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue);
+ if(getNode(doc, elmt, ''./text()[1]'', nsResolver)) {
+ var value = cleanString(getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue);
+
+ if(attribute == "Publisher:") {
+ if(value.lastIndexOf("(") != -1) {
+ var date = value.substring(value.lastIndexOf("(")+1, value.length-1);
+ value = value.substring(0, value.lastIndexOf("(")-1);
+ }
+ if(value.lastIndexOf(";") != -1) {
+ var edition = value.substring(value.lastIndexOf(";")+2, value.length);
+ value = value.substring(0, value.lastIndexOf(";"));
+ }
+ model.addStatement(uri, prefixDC + ''publisher'', value);
+ model.addStatement(uri, prefixDC + ''date'', date);
+ model.addStatement(uri, prefixDC + ''hasVersion'', edition);
+ } else if(attribute == "Language:") {
+ model.addStatement(uri, prefixDC + ''language'', value);
+ } else if(attribute == "ISBN:") {
+ model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value);
+ } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") {
+ model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" ")));
+ model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));
+ }
+ }
+}
+
+var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+var title = cleanString(getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue);
+if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) {
+ title = title.substring(0, title.lastIndexOf("(")-1);
+}
+model.addStatement(uri, prefixDC + ''title'', title);');
+
+INSERT INTO "scrapers" VALUES(2, NULL, NULL, 20060603002000, 'WorldCat Scraper', 'Simon Kornblith', '^http://newfirstsearch\.oclc\.org/WebZ/',
+'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') {
+ return true;
+}
+return false;',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/;
+var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/;
+var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/;
+var lineRegexp = /^([\w() ]+): *(.*)$/;
+var publisherRegexp = /^(.*), (.*?),?$/;
+
+var uri = doc.location.href;
+
+var sMatch = sessionRegexp.exec(uri);
+var sessionid = sMatch[1];
+
+var nMatch = numberRegexp.exec(uri);
+if(nMatch) {
+ var number = nMatch[1];
+} else {
+ number = 1;
+}
+
+var rMatch = resultsetRegexp.exec(uri);
+if(rMatch) {
+ var resultset = rMatch[1];
+} else {
+ // It''s in an XPCNativeWrapper, so we have to do this black magic
+ resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value;
+}
+
+var newUri = ''http://newfirstsearch.oclc.org/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0'';
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+
+function cleanAuthor(author) {
+ author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+ author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+ // Add period for initials
+ if(author.substring(author.length-2, author.length-1) == " ") {
+ author += ".";
+ }
+ var splitNames = author.split('', '');
+ if(splitNames.length > 1) {
+ author = splitNames[1]+'' ''+splitNames[0];
+ }
+ return author;
+}
+
+utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintext'', null, function(text) {
+ var lines = text.split(''\n'');
+ for(var i=0;i<lines.length;i++) {
+ match = lineRegexp.exec(lines[i]);
+ if(match) {
+ if(match[1] == ''Title'') {
+ var title = match[2];
+ if(!lineRegexp.test(lines[i+1])) {
+ i++;
+ title += '' ''+lines[i];
+ }
+ if(title.substring(title.length-2) == " /") {
+ title = title.substring(0, title.length-2);
+ }
+ model.addStatement(uri, prefixDC + ''title'', title);
+ } else if(match[1] == ''Author(s)'') {
+ var authors = match[2].split('';'');
+ if(authors) {
+ model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(authors[0]));
+ for(var j=1; j<authors.length; j+=2) {
+ if(authors[j-1].substring(0, 1) == ''('') {
+ j++;
+ }
+ model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(authors[j]));
+ }
+ } else {
+ model.addStatement(uri, prefixDC + ''creator'', utilities.trimString(match[2]));
+ }
+ } else if(match[1] == ''Publication'') {
+ // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
+ match[2] = utilities.trimString(match[2]);
+ if(match[2].substring(match[2].length-1) == '','') {
+ match[2] = match[2].substring(0, match[2].length-1);
+ }
+ model.addStatement(uri, prefixDC + ''publisher'', match[2]);
+ } else if(match[1] == ''Language'') {
+ model.addStatement(uri, prefixDC + ''language'', utilities.trimString(match[2]));
+ } else if(match[1] == ''Standard No'') {
+ var identifiers = match[2].split(/ +/);
+ var j=0;
+ while(j<(identifiers.length-1)) {
+ var type = identifiers[j].substring(0, identifiers[j].length-1);
+ var lastChar;
+ var value;
+
+ j++;
+ while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') {
+ if(identifiers[j].substring(0, 1) != ''('') {
+ if(lastChar == '';'') {
+ value = identifiers[j].substring(0, identifiers[j].length-1);
+ } else {
+ value = identifiers[j];
+ }
+ model.addStatement(uri, prefixDC + ''identifier'', type + '' '' + value);
+ }
+ j++;
+ }
+ }
+ } else if(match[1] == ''Year'') {
+ model.addStatement(uri, prefixDC + ''year'', match[2]);
+ }
+ }
+ }
+
+ done();
+})
+wait();');
+
+INSERT INTO "scrapers" VALUES(3, NULL, NULL, 20060603002000, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi',
+'try {
+ if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) {
+ return false;
+ }
+ var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options;
+ for(i in export_options) {
+ if(export_options[i].text == ''Latin1 MARC''
+ || export_options[i].text == ''Raw MARC''
+ || export_options[i].text == ''UTF-8''
+ || export_options[i].text == ''MARC (Unicode/UTF-8)''
+ || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
+ return true;
+ }
+ }
+ return false;
+} catch(e) {
+ return false;
+}',
+'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+var raw, unicode, latin1;
+
+var form = doc.forms.namedItem(''frm'');
+var newUri = form.action;
+var postString = '''';
+for(i in form.elements) {
+ if(form.elements[i].type == ''HIDDEN'' || form.elements[i].type == ''hidden'') {
+ postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&'';
+ }
+}
+
+var export_options = form.elements.namedItem(''RD'').options;
+for(i in export_options) {
+ if(export_options[i].text == ''Raw MARC''
+ || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') {
+ raw = i;
+ } if(export_options[i].text == ''Latin1 MARC'') {
+ latin1 = i;
+ } else if(export_options[i].text == ''UTF-8''
+ || export_options[i].text == ''MARC (Unicode/UTF-8)'') {
+ unicode = i;
+ }
+}
+postString += ''RD=''+i+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT'';
+
+model.addStatement(uri, prefixRDF + ''type'', prefixDummy + ''book'', false);
+
+// No idea why this doesn''t work as post
+utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) {
+ var record = new MARC_Record();
+ record.load(text, "binary");
+ model = utilities.importMARCRecord(record, uri, model);
+ done();
+})
+wait();');
+
+-- JSTOR Scraper. The last value is the scraper JavaScript; every single quote
+-- inside it is doubled because the code lives in an SQL string literal.
+INSERT INTO "scrapers" VALUES(4, NULL, NULL, 20060603002000, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse)', NULL, '// Saves the article shown on a JSTOR page: clears the marked citations, marks
+// this article, downloads the marked list as text and turns it into statements.
+// doc, model, utilities, wait() and done() are supplied by the caller of this code.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Returns the first node matching xpath relative to contextNode
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+	return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+// Trims the string and collapses runs of spaces into one space
+var cleanString = function(s) {
+	s = utilities.trimString(s);
+	return s.replace(/ +/g, " ");
+}
+
+var uri = doc.location.href;
+
+
+// Find the citation links; the two layouts nest the p and font tags differently
+var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+if(!elmts || !elmts.length) {
+	xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]'';
+	elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+}
+if(!elmts || elmts.length < 2) {
+	// Both the save link and the view-saved link are required below
+	throw new Error("JSTOR Scraper: citation links not found");
+}
+var saveCitation = getNode(doc, elmts[0], ''.'', nsResolver).href;
+var viewSavedCitations = getNode(doc, elmts[1], ''.'', nsResolver).href;
+saveCitation = saveCitation.replace(''citationAction=remove'', ''citationAction=save'');
+
+// Parse save citation link; everything from userID onwards is reused in later requests
+var importantCitationRegexp = /userID.*$/;
+var match = importantCitationRegexp.exec(viewSavedCitations);
+if(!match) {
+	throw new Error("JSTOR Scraper: no userID in the saved citations link");
+}
+var postData = match[0]+''&citationAction=removeAll&confirmRemAll=on'';
+utilities.HTTPUtilities.doPost(''http://www.jstor.org/browse'', postData, null, function() { // clear marked
+	utilities.HTTPUtilities.doGet(saveCitation, null, function() { // mark this
+		utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse/citations.txt?exportAction=Save+as+Text+File&exportFormat=cm&''+match[0], null, function(text) {
+			// get marked
+			var lines = text.split("\n");
+			var haveStarted = false;
+
+			// Values are collected per predicate first, because the subject
+			// (the stable URL) is only known once the EI line has been read
+			var data = new Object();
+			data[prefixDC + "title"] = new Array();
+			data[prefixDC + "creator"] = new Array();
+			data[prefixDummy + "publication"] = new Array();
+			data[prefixDummy + "volume"] = new Array();
+			data[prefixDummy + "number"] = new Array();
+			data[prefixDummy + "series"] = new Array();
+			data[prefixDC + "date"] = new Array();
+			data[prefixDummy + "pages"] = new Array();
+			data[prefixDC + "identifier"] = new Array();
+			data[prefixDC + "publisher"] = new Array();
+
+			var stableURL;
+
+			for(var i = 0; i < lines.length; i++) {
+				var line = lines[i];
+				if(haveStarted) {
+					// Field lines look like "XX : content"; anything else ends the record
+					if(line.substring(2, 5) != " : ") {
+						break;
+					}
+
+					var fieldCode = line.substring(0, 2);
+					var fieldContent = cleanString(line.substring(5));
+
+					if(fieldCode == "TI") {
+						data[prefixDC + "title"].push(fieldContent);
+					} else if(fieldCode == "AU") {
+						var authors = fieldContent.split(";");
+						for(var j = 0; j < authors.length; j++) {
+							var author = cleanString(authors[j]);
+							// Turn "Last, First" into "First Last"; names without a comma stay as they are
+							var splitNames = author.split('', '');
+							if(splitNames.length > 1) {
+								author = splitNames[1]+'' ''+splitNames[0];
+							}
+							data[prefixDC + "creator"].push(author);
+						}
+					} else if(fieldCode == "SO") {
+						data[prefixDummy + "publication"].push(fieldContent);
+					} else if(fieldCode == "VO") {
+						data[prefixDummy + "volume"].push(fieldContent);
+					} else if(fieldCode == "NO") {
+						data[prefixDummy + "number"].push(fieldContent);
+					} else if(fieldCode == "SE") {
+						data[prefixDummy + "series"].push(fieldContent);
+					} else if(fieldCode == "DA") {
+						data[prefixDC + "date"].push(fieldContent);
+					} else if(fieldCode == "PP") {
+						data[prefixDummy + "pages"].push(fieldContent);
+					} else if(fieldCode == "EI") {
+						stableURL = fieldContent;
+					} else if(fieldCode == "IN") {
+						data[prefixDC + "identifier"].push("ISSN "+fieldContent);
+					} else if(fieldCode == "PB") {
+						data[prefixDC + "publisher"].push(fieldContent);
+					}
+				}
+				// The record starts on the line after the "<1>" marker
+				if(line.substring(0,3) == "<1>") {
+					haveStarted = true;
+				}
+			}
+
+			// Fall back to the page address when the export has no stable URL line
+			if(!stableURL) {
+				stableURL = uri;
+			}
+
+			// Loop through again so that we can add with the stableURL
+			model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journal", false);
+			for(var predicate in data) {
+				var values = data[predicate];
+				for(var k = 0; k < values.length; k++) {
+					model.addStatement(stableURL, predicate, values[k]);
+				}
+			}
+
+			done();
+		})
+	})
+});
+
+wait();');
+
+-- History Cooperative Scraper. The last value is the scraper JavaScript; every
+-- single quote inside it is doubled because the code lives in an SQL string literal.
+INSERT INTO "scrapers" VALUES(5, NULL, NULL, 20060603002000, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/journals/.+/.+/.+\.html', NULL, '// Builds the citation of an article page from its meta tags.
+// doc and model are supplied by the caller of this code.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+
+var metaTags = doc.getElementsByTagName("meta");
+
+// Adds the content of the meta tag called name, when present, under predicate rdfUri
+function associateMeta(name, rdfUri) {
+	var tag = metaTags.namedItem(name);
+	if(tag) {
+		model.addStatement(uri, rdfUri, tag.getAttribute("content"), false);
+	}
+}
+
+associateMeta("Title", prefixDC + "title");
+associateMeta("Journal", prefixDummy + "publication");
+associateMeta("Volume", prefixDummy + "volume");
+associateMeta("Issue", prefixDummy + "number");
+
+// The Author tag lists several authors joined by " and "; each becomes a creator
+var author = metaTags.namedItem("Author");
+if(author) {
+	var authors = author.getAttribute("content").split(" and ");
+	for(var j = 0; j < authors.length; j++) {
+		model.addStatement(uri, prefixDC + "creator", authors[j], false);
+	}
+}
+
+// A date is only recorded when both the month and the year are available
+var month = metaTags.namedItem("PublicationMonth");
+var year = metaTags.namedItem("PublicationYear");
+if(month && year) {
+	model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false);
+}
+');
+
+-- InnoPAC Scraper. The last value is the scraper JavaScript; every single quote
+-- inside it is doubled because the code lives in an SQL string literal.
+INSERT INTO "scrapers" VALUES(6, NULL, NULL, 20060603002000, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/frameset\&FF=', NULL,
+'// Loads the MARC view of the record shown in the frameset page and imports it.
+// doc, model, browser, utilities, MARC_Record, wait() and done() are supplied
+// by the caller of this code.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+// The MARC page has the same address with "frameset" replaced by "marc"
+var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$'');
+var m = matchRegexp.exec(uri);
+if(!m) {
+	throw new Error("InnoPAC Scraper: unexpected record address "+uri);
+}
+var newUri = m[1]+''marc''+m[2];
+
+utilities.loadDocument(newUri, browser, function(newBrowser) {
+	var newDoc = newBrowser.contentDocument;
+
+	var namespace = newDoc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	// Returns the first node matching xpath relative to contextNode
+	var getNode = function(doc, contextNode, xpath, nsResolver) {
+		return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+	}
+
+	var xpath = ''//pre'';
+	var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
+	if(!elmts || !elmts.length) {
+		throw new Error("InnoPAC Scraper: no pre element in "+newUri);
+	}
+
+	// The pre element belongs to newDoc, so the XPath must be evaluated by newDoc too
+	var text = getNode(newDoc, elmts[0], ''./text()[1]'', nsResolver).nodeValue;
+
+	var record = new MARC_Record();
+	record.load(text, "MARC_PAC");
+	model = utilities.importMARCRecord(record, uri, model);
+	done();
+}, function() {});
+
+wait();');
+
+-- SIRSI Scraper. The two trailing values are JavaScript: first the code that decides
+-- whether the page is a record page (it returns true or false), then the scraper itself.
+-- Every single quote inside them is doubled because the code lives in SQL string literals.
+INSERT INTO "scrapers" VALUES(7, NULL, NULL, 20060603002000, 'SIRSI Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
+'// The page is scrapable when it has at least one row with a tag cell and a value cell
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+if(elmts.length) {
+	return true;
+}
+return false;',
+'// Reads the labelled rows of a SIRSI record page and adds one statement per
+// distinct value. doc, model and utilities are supplied by the caller of this code.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Returns the first node matching xpath relative to contextNode
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+	return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+// Strips leading and trailing characters that are neither word characters nor parentheses
+function stringTrimmer(x) {
+	x = x.replace(/^[^\w(]+/, "");
+	return x.replace(/[^\w)]+$/, "");
+}
+
+// Returns the first run of digits in x, or null when there is none
+function getAnyNumber(x) {
+	var re = /[0-9]+/;
+	var m = re.exec(x);
+	if(m) {
+		return m[0];
+	}
+	return null;
+}
+
+// Returns the leading ISBN-like run (digits and X) of x, or null when there is none
+function getISBN(x) {
+	var re = /^[0-9](?:[0-9X]+)/;
+	var m = re.exec(x);
+	if(m) {
+		return m[0];
+	}
+	return null;
+}
+
+// Strips surrounding punctuation and turns "Last, First" into "First Last"
+function cleanAuthor(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+	// Add period for initials
+	if(author.substring(author.length-2, author.length-1) == " ") {
+		author += ".";
+	}
+	var splitNames = author.split('', '');
+	if(splitNames.length > 1) {
+		author = splitNames[1]+'' ''+splitNames[0];
+	}
+	return author;
+}
+
+var uri = doc.location.href;
+// Values already added, keyed by predicate, so that duplicates are skipped
+var data = new Object();
+
+var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	var elmt = elmts[i];
+	try {
+		// The value is either linked or plain text in the first cell
+		var node = getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
+		if(!node) {
+			node = getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
+		}
+		if(node) {
+			var field = stringTrimmer(getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue);
+			field = field.toLowerCase();
+			var value = stringTrimmer(node.nodeValue);
+			var rdfUri = null;
+			if(field == "publisher") {
+				rdfUri = prefixDC + ''publisher'';
+			} else if(field == "pub date") {
+				// Skip the row instead of storing an undefined date
+				value = getAnyNumber(value);
+				if(value) {
+					rdfUri = prefixDC + ''date'';
+				}
+			} else if(field == "isbn") {
+				// Skip the row instead of storing "ISBN undefined"
+				var isbn = getISBN(value);
+				if(isbn) {
+					rdfUri = prefixDC + ''identifier'';
+					value = ''ISBN ''+isbn;
+				}
+			} else if(field == "title") {
+				rdfUri = prefixDC + ''title'';
+				var titleParts = value.split(" / ");
+				value = titleParts[0];
+			} else if(field == "publication info") {
+				rdfUri = prefixDummy + ''place'';
+				var pubParts = value.split(" : ");
+				value = pubParts[0];
+			} else if(field == "personal author") {
+				rdfUri = prefixDC + ''creator'';
+				value = cleanAuthor(node.nodeValue);
+			} else if(field == "added author") {
+				rdfUri = prefixDC + ''contributor'';
+				value = cleanAuthor(node.nodeValue);
+			} else if(field == "corporate author") {
+				rdfUri = prefixDC + ''creator'';
+			}
+			if(rdfUri) {
+				if(!data[rdfUri]) {
+					data[rdfUri] = new Array();
+				}
+				var known = data[rdfUri];
+				var insert = true;
+				for(var j = 0; j < known.length; j++) {
+					if(known[j] == value) {
+						insert = false;
+						break;
+					}
+				}
+				if(insert) {
+					known.push(value);
+					model.addStatement(uri, rdfUri, value, true);
+				}
+			}
+		}
+	} catch (e) {
+		// Best effort: a malformed row is reported and skipped, the other rows still count
+		utilities.debugPrint("SIRSI Scraper: skipped a row: "+e);
+	}
+
+}
+');
+
+-- ProQuest Scraper. The last value is the scraper JavaScript; every single quote
+-- inside it is doubled because the code lives in an SQL string literal.
+INSERT INTO "scrapers" VALUES(8, NULL, NULL, 20060603002000, 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
+'// Reads title, authors and the labelled detail rows of a ProQuest document page.
+// doc, model and utilities are supplied by the caller of this code.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Returns the first node matching xpath relative to contextNode, or a null result when none matches
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+	return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+// Strips leading and trailing characters that are neither word characters nor parentheses
+function stringTrimmer(x) {
+	x = x.replace(/^[^\w(]+/, "");
+	return x.replace(/[^\w)]+$/, "");
+}
+
+// Returns the first run of digits and hyphens in x, if any
+function getPageRange(x) {
+	var re = /[0-9\-]+/;
+	var m = re.exec(x);
+	if(m) {
+		return m[0];
+	}
+}
+
+// Strips surrounding punctuation and turns "Last, First" into "First Last"
+function cleanAuthor(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+	// Add period for initials
+	if(author.substring(author.length-2, author.length-1) == " ") {
+		author += ".";
+	}
+	var splitNames = author.split('', '');
+	if(splitNames.length > 1) {
+		author = splitNames[1]+'' ''+splitNames[0];
+	}
+	return author;
+}
+
+var uri = doc.location.href;
+
+// Title: concatenate all text nodes, since highlighting splits the title into pieces
+var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+var title = "";
+for (var i = 0; i < elmts.length; i++) {
+	title += elmts[i].nodeValue;
+}
+if(title) {
+	model.addStatement(uri, prefixDC + ''title'', title, true);
+}
+
+// Authors
+xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="textMedium"]/a/em'';
+elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	// Dirty hack to fix highlighted words
+	var author = "";
+	var authorElmts = utilities.gatherElementsOnXPath(doc, elmts[i], ''.//text()'', nsResolver);
+	for (var j = 0; j < authorElmts.length; j++) {
+		author += authorElmts[j].nodeValue;
+	}
+	model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(author), true);
+}
+
+// Other info
+xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr'';
+elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	var elmt = elmts[i];
+	// Rows without a text label carry nothing usable
+	var label = getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver);
+	if(!label) {
+		continue;
+	}
+	var field = stringTrimmer(label.nodeValue).toLowerCase();
+	if(field == "publication title") {
+		// Each part of the row is optional, so every lookup is checked before use
+		var publication = getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver);
+		if(publication && publication.nodeValue) {
+			model.addStatement(uri, prefixDummy + ''publication'', stringTrimmer(publication.nodeValue), true);
+		}
+		var place = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+		if(place && place.nodeValue) {
+			model.addStatement(uri, prefixDummy + ''place'', stringTrimmer(place.nodeValue), true);
+		}
+		var date = getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver);
+		if(date && date.nodeValue) {
+			model.addStatement(uri, prefixDC + ''date'', stringTrimmer(date.nodeValue), true);
+		}
+		var moreInfo = getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver);
+		if(moreInfo && moreInfo.nodeValue) {
+			moreInfo = stringTrimmer(moreInfo.nodeValue);
+			// Issue details and pages are separated by a semicolon and a non-breaking space
+			var parts = moreInfo.split(";\xA0");
+
+			var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/;
+			var issueInfo = parts[0].split(",\xA0");
+			for(var j = 0; j < issueInfo.length; j++) {
+				var m = issueRegexp.exec(issueInfo[j]);
+				if(!m) {
+					// Not of the form "Vol. 12"; ignore this piece and keep going
+					continue;
+				}
+				var info = m[1].toLowerCase();
+				if(info == "vol") {
+					model.addStatement(uri, prefixDummy + ''volume'', stringTrimmer(m[2]), true);
+				} else if(info == "iss" || info == "no") {
+					model.addStatement(uri, prefixDummy + ''number'', stringTrimmer(m[2]), true);
+				}
+			}
+			if(parts[1] && stringTrimmer(parts[1]).substring(0, 3).toLowerCase() == "pg.") {
+				var pages = getPageRange(parts[1]);
+				if(pages) {
+					model.addStatement(uri, prefixDummy + ''pages'', pages, true);
+				}
+			}
+		}
+	} else if(field == "source type") {
+		var value = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+		if(value && value.nodeValue) {
+			value = stringTrimmer(value.nodeValue).toLowerCase();
+
+			if(value == "newspaper" || value == "periodical") {
+				model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+			} else {
+				model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+			}
+		}
+	} else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
+		var value = getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+		if(value && value.nodeValue) {
+			// The kind of identifier is decided by its length alone
+			var type;
+			value = stringTrimmer(value.nodeValue);
+			if(value.length == 10 || value.length == 13) {
+				type = "ISBN";
+			} else if(value.length == 8) {
+				type = "ISSN";
+			}
+			if(type) {
+				model.addStatement(uri, prefixDC + "identifier", type+" "+value, false);
+			}
+		}
+	}
+}');
+
+-- InfoTrac Scraper. The two trailing values are JavaScript: first the code that decides
+-- whether the page is an article page (it returns true or false), then the scraper itself.
+-- Every single quote inside them is doubled because the code lives in SQL string literals.
+INSERT INTO "scrapers" VALUES(9, NULL, NULL, 20060603002000, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',
+'// Only pages whose title starts with "Article " are scraped
+if(doc.title.substring(0, 8) == "Article ") {
+	return true;
+}
+return false;',
+'// Reads the citation from the HTML comments of an InfoTrac article page; the code
+// below expects comments of the form "field:value" padded by one character on each side.
+// doc, model and utilities are supplied by the caller of this code.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+	if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Strips surrounding punctuation and turns "Last, First" into "First Last"
+function cleanAuthor(author) {
+	author = author.replace(/^[\s\.\,\/\[\]\:]+/, '''');
+	author = author.replace(/[\s\,\/\[\]\:\.]+$/, '''');
+	// Add period for initials
+	if(author.substring(author.length-2, author.length-1) == " ") {
+		author += ".";
+	}
+	var splitNames = author.split('', '');
+	if(splitNames.length > 1) {
+		author = splitNames[1]+'' ''+splitNames[0];
+	}
+	return author;
+}
+
+var uri = doc.location.href;
+
+var xpath = ''/html/body//comment()'';
+var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+for (var i = 0; i < elmts.length; i++) {
+	var commentText = elmts[i].nodeValue;
+	var colon = commentText.indexOf(":");
+	if(colon < 0) {
+		// Not a field comment
+		continue;
+	}
+	var field = commentText.substring(1, colon).toLowerCase();
+	var value = commentText.substring(colon+1, commentText.length-1);
+	if(field == "title") {
+		model.addStatement(uri, prefixDC + "title", value, false);
+	} else if(field == "journal") {
+		model.addStatement(uri, prefixDummy + "publication", value, false);
+	} else if(field == "pi") {
+		// Space separated words: the date comes first, then words starting with
+		// v (volume), i (issue) or p (pages)
+		var parts = value.split(" ");
+		var date = "";
+		var isDate = true;
+		for(var j = 0; j < parts.length; j++) {
+			var part = parts[j];
+			var firstChar = part.substring(0, 1);
+			var rdfUri = false;
+
+			if(firstChar == "v") {
+				rdfUri = prefixDummy + "volume";
+			} else if(firstChar == "i") {
+				// Same predicate the other scrapers use for the issue number
+				rdfUri = prefixDummy + "number";
+			} else if(firstChar == "p") {
+				rdfUri = prefixDummy + "pages";
+				var pagesRegexp = /p(\w+)\((\w+)\)/;
+				var match = pagesRegexp.exec(part);
+				if(match) {
+					// Explicit radix so that values such as "08" are read as decimal
+					var firstPage = parseInt(match[1], 10);
+					var pageCount = parseInt(match[2], 10);
+					// NOTE(review): the end page is computed as first page plus the number in
+					// parentheses, as before; confirm whether it should be one less
+					if(!isNaN(firstPage) && !isNaN(pageCount)) {
+						var finalPage = firstPage+pageCount;
+						part = "p"+match[1]+"-"+finalPage.toString();
+					}
+				}
+			}
+
+			if(rdfUri) {
+				// Once volume, issue or pages start, no later word belongs to the date
+				isDate = false;
+				if(part != "pNA") { // not a real page number
+					var content = part.substring(1);
+					model.addStatement(uri, rdfUri, content, true);
+				}
+			} else if(isDate) {
+				date += " "+part;
+			}
+		}
+		if(date != "") {
+			// Drop the leading space added while joining
+			model.addStatement(uri, prefixDC + "date", date.substring(1), false);
+		}
+	} else if(field == "author") {
+		model.addStatement(uri, prefixDC + "creator", cleanAuthor(value), false);
+	}
+}
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);');
+
+-- LexisNexis Scraper. The last value is the scraper JavaScript; every single quote
+-- inside it is doubled because the code lives in an SQL string literal.
+INSERT INTO "scrapers" VALUES(10, NULL, NULL, 20060603002000, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/document', NULL,
+'// Extracts publication, date, headline and authors from the "bodytext" block of a
+// LexisNexis document page. doc, model and utilities are supplied by the caller.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+// Turns br tags into newlines and removes every other tag
+function clearTags(x) {
+	x = x.replace(/<br[^>]*>/gi, "\n");
+	return x.replace(/<[^>]+>/g, "");
+}
+
+var uri = doc.location.href;
+
+// The citation lives in the first div with class "bodytext"
+var citationDataDiv;
+var divs = doc.getElementsByTagName("div");
+for(var i = 0; i < divs.length; i++) {
+	if(divs[i].className == "bodytext") {
+		citationDataDiv = divs[i];
+		break;
+	}
+}
+if(!citationDataDiv) {
+	throw new Error("LexisNexis Scraper: bodytext block not found");
+}
+
+var centerElements = citationDataDiv.getElementsByTagName("center");
+if(centerElements.length) {
+	// The publication name is the last line of the first centered block
+	var elementParts = centerElements[0].innerHTML.split(/<br[^>]*>/gi);
+	model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true);
+
+	// The date is in the last centered block, either with the month in bold
+	// or as the second line of the block
+	var lastCenterHTML = centerElements[centerElements.length-1].innerHTML;
+	var dateRegexp = /<br[^>]*><b>([A-Z][a-z]+)<\/b> ([0-9]+, [0-9]{4})/;
+	var dateMatch = dateRegexp.exec(lastCenterHTML);
+	if(dateMatch) {
+		model.addStatement(uri, prefixDC + "date", dateMatch[1]+" "+dateMatch[2], true);
+	} else {
+		var dateParts = lastCenterHTML.split(/<br[^>]*>/gi);
+		if(dateParts.length > 1) {
+			model.addStatement(uri, prefixDC + "date", dateParts[1], true);
+		}
+	}
+}
+
+// Only the header part in front of the article text is searched for fields
+var cutIndex = citationDataDiv.innerHTML.indexOf("<b>BODY:</b>");
+if(cutIndex < 0) {
+	cutIndex = citationDataDiv.innerHTML.indexOf("<b>TEXT:</b>");
+}
+var citationData;
+if(cutIndex > 0) {
+	citationData = citationDataDiv.innerHTML.substring(0, cutIndex);
+} else {
+	citationData = citationDataDiv.innerHTML;
+}
+
+citationData = clearTags(citationData);
+
+var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/;
+var m = headlineRegexp.exec(citationData);
+if(m) {
+	model.addStatement(uri, prefixDC + "title", clearTags(m[1]), true);
+}
+
+var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/;
+m = bylineRegexp.exec(citationData);
+if(m) {
+	var byline = m[1];
+	utilities.debugPrint(byline.substring(0, 3).toLowerCase());
+	// Drop a leading "By "
+	if(byline.substring(0, 3).toLowerCase() == "by ") {
+		byline = byline.substring(3);
+	}
+	model.addStatement(uri, prefixDC + "creator", byline, true);
+}
+
+var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/;
+m = authorRegexp.exec(citationData);
+if(m) {
+	// Names are separated by ", " with an optional "and " in front of the last one
+	var authors = m[1].split(/, (?:and )?/);
+	for(var j = 0; j < authors.length; j++) {
+		model.addStatement(uri, prefixDC + "creator", authors[j].replace(" *", ""), true);
+	}
+}
+
+model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
+
+utilities.debugPrint(citationData);');
+
+-- Aleph Scraper. The last value is the scraper JavaScript; every single quote
+-- inside it is doubled because the code lives in an SQL string literal.
+INSERT INTO "scrapers" VALUES(11, NULL, NULL, 20060603002000, 'Aleph Scraper', 'Simon Kornblith', 'func=full-set-set.*\&format=999', NULL,
+'// Loads the format=001 view of the record, rebuilds a MARC record from its table
+// rows and imports it. doc, model, browser, utilities, MARC_Record, wait() and done()
+// are supplied by the caller of this code.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+var newUri = uri.replace("&format=999", "&format=001");
+utilities.debugPrint(newUri);
+
+// Strips leading and trailing characters that are neither word characters nor parentheses
+function stringTrimmer(x) {
+	x = x.replace(/^[^\w(]+/, "");
+	return x.replace(/[^\w)]+$/, "");
+}
+
+utilities.loadDocument(newUri, browser, function(newBrowser) {
+	var newDoc = newBrowser.contentDocument;
+
+	var namespace = newDoc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	// Returns the first node matching xpath relative to contextNode, or a null result when none matches
+	var getNode = function(doc, contextNode, xpath, nsResolver) {
+		return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+	}
+
+	var xpath = ''/html/body/table/tbody/tr[td[1][@class="td1"][@id="bold"]][td[2][@class="td1"]]'';
+	var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
+	var record = new MARC_Record();
+	for(var i=0; i<elmts.length; i++) {
+		var elmt = elmts[i];
+		// The rows belong to newDoc, so the XPath must be evaluated by newDoc too
+		var fieldNode = getNode(newDoc, elmt, ''./TD[1]/text()[1]'', nsResolver);
+		var valueNode = getNode(newDoc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+		if(!fieldNode || !valueNode) {
+			// A row without text in both cells cannot be turned into a field
+			continue;
+		}
+		var field = stringTrimmer(fieldNode.nodeValue);
+		// Subfields are shown as "|a "; replace that with the record delimiter plus the code
+		var value = valueNode.nodeValue.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1");
+
+		if(field != "FMT" && field != "LDR") {
+			// The label is the three character tag, optionally followed by the two indicators
+			var code = field.substring(0, 3);
+			var ind1 = "";
+			var ind2 = "";
+			if(field.length > 3) {
+				ind1 = field.charAt(3);
+				if(field.length > 4) {
+					ind2 = field.charAt(4);
+				}
+			}
+			record.add_field(code, ind1, ind2, value);
+		}
+	}
+
+	model = utilities.importMARCRecord(record, uri, model);
+	done();
+}, function() {});
+
+wait();');
+
+
+-- Dynix Scraper. The last value is the scraper JavaScript; every single quote
+-- inside it is doubled because the code lives in an SQL string literal.
+INSERT INTO "scrapers" VALUES(12, NULL, NULL, 20060603002000, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*uri=full=[0-9]', NULL,
+'// Loads the fullmarc view of the record, rebuilds a MARC record from its table rows
+// and imports it. doc, model, browser, utilities, MARC_Record, wait() and done() are
+// supplied by the caller of this code.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+var newUri = uri+''&fullmarc=true'';
+utilities.debugPrint(newUri);
+
+// Strips leading and trailing characters that are neither word characters nor parentheses
+function stringTrimmer(x) {
+	x = x.replace(/^[^\w(]+/, "");
+	return x.replace(/[^\w)]+$/, "");
+}
+
+// Returns the first node matching xpath relative to contextNode, or a null result when none matches
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+	return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+}
+
+// Returns the concatenated values of all nodes matching xpath relative to contextNode
+var getNodeString = function(doc, contextNode, xpath, nsResolver) {
+	var elmts = utilities.gatherElementsOnXPath(doc, contextNode, xpath, nsResolver);
+	var returnVar = "";
+	for(var i=0; i<elmts.length; i++) {
+		returnVar += elmts[i].nodeValue;
+	}
+	return returnVar;
+}
+
+utilities.loadDocument(newUri, browser, function(newBrowser) {
+	var newDoc = newBrowser.contentDocument;
+
+	var namespace = newDoc.documentElement.namespaceURI;
+	var nsResolver = namespace ? function(prefix) {
+		if (prefix == ''x'') return namespace; else return null;
+	} : null;
+
+	var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]'';
+	var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver);
+	var record = new MARC_Record();
+	for(var i=0; i<elmts.length; i++) {
+		var elmt = elmts[i];
+		// The rows belong to newDoc, so the XPath must be evaluated by newDoc too
+		var fieldNode = getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver);
+		if(!fieldNode) {
+			// A row without a tag label cannot be turned into a field
+			continue;
+		}
+		var field = stringTrimmer(fieldNode.nodeValue);
+		var value = getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver);
+		// Subfields are shown as "$a "; replace that with the record delimiter plus the code
+		value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1");
+
+		if(field != "FMT" && field != "LDR") {
+			var ind1 = "";
+			var ind2 = "";
+			// The value may start with one or two indicator digits followed by a space
+			var valRegexp = /^([0-9])([0-9])? (.*)$/;
+			var m = valRegexp.exec(value);
+			if(m) {
+				ind1 = m[1];
+				// The second digit is optional, so test the captured group itself
+				if(m[2]) {
+					ind2 = m[2];
+				}
+				value = m[3];
+			}
+			record.add_field(field, ind1, ind2, value);
+		}
+	}
+
+	model = utilities.importMARCRecord(record, uri, model);
+	done();
+}, function() {});
+
+wait();');
+COMMIT;
+\ No newline at end of file