www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit c64e5c841f1e08c5ca40e9ecd617a96a29654a37
parent d65328c830f120315154e9cacd6c3eaac2264d07
Author: Simon Kornblith <simon@simonster.com>
Date:   Mon, 17 Jul 2006 04:06:58 +0000

closes #78, figure out import/export architecture
closes #100, migrate ingester to Scholar.Translate
closes #88, migrate scrapers away from RDF
closes #9, pull out LC subject heading tags
references #87, add fromArray() and toArray() methods to item objects

API changes:
all translation (import/export/web) now goes through Scholar.Translate
all Scholar-specific functions in scrapers start with "Scholar." rather than the jumbled up piggy bank un-namespaced confusion
scrapers no longer specify items through RDF (the beginning of an item.fromArray()-like function exists in Scholar.Translate.prototype._itemDone())
scrapers can be any combination of import, export, and web (type is the sum of 1/2/4 respectively)
scrapers now contain functions (doImport, doExport, doWeb) rather than loose code
scrapers can call functions in other scrapers or just call the function to translate itself
export accesses items item-by-item, rather than accepting a huge array of items
MARC functions are now in the MARC import translator, and accessed by the web translators

new features:
import now works
rudimentary RDF (unqualified dublin core only), RIS, and MARC import translators are implemented (although they are a little picky with respect to file extensions at the moment)
items appear as they are scraped
MARC import translator pulls out tags, although this seems to slow things down
no icon appears next to the URL when Scholar hasn't detected metadata, since this seemed somewhat confusing

apologies for the size of this diff. i figured if i was going to re-write the API, i might as well do it all at once and get everything working right.



Diffstat:
Mchrome/chromeFiles/content/scholar/fileInterface.js | 38++++++++++++++++++++++++++++++++++++++
Mchrome/chromeFiles/content/scholar/ingester/browser.js | 373++++++++++++++++++++++++++++++++++++++++++-------------------------------------
Mchrome/chromeFiles/content/scholar/xpcom/ingester.js | 523++++++++-----------------------------------------------------------------------
Dchrome/chromeFiles/content/scholar/xpcom/marc.js | 533-------------------------------------------------------------------------------
Mchrome/chromeFiles/content/scholar/xpcom/translate.js | 970++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---------------
Mchrome/chromeFiles/content/scholar/xpcom/utilities.js | 183+++++++++++++++++--------------------------------------------------------------
Mcomponents/chnmIScholarService.js | 4----
Mscrapers.sql | 4918+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
8 files changed, 3937 insertions(+), 3605 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/fileInterface.js b/chrome/chromeFiles/content/scholar/fileInterface.js @@ -1,5 +1,6 @@ Scholar_File_Interface = new function() { this.exportFile = exportFile; + this.importFile = importFile; /* * Creates Scholar.Translate instance and shows file picker for file export @@ -23,4 +24,41 @@ Scholar_File_Interface = new function() { translation.translate(); } } + + /* + * Creates Scholar.Translate instance and shows file picker for file import + */ + function importFile() { + var translation = new Scholar.Translate("import"); + var translators = translation.getTranslators(); + + const nsIFilePicker = Components.interfaces.nsIFilePicker; + var fp = Components.classes["@mozilla.org/filepicker;1"] + .createInstance(nsIFilePicker); + fp.init(window, "Import", nsIFilePicker.modeOpen); + for(var i in translators) { + fp.appendFilter(translators[i].label, "*."+translators[i].target); + } + + var rv = fp.show(); + if (rv == nsIFilePicker.returnOK || rv == nsIFilePicker.returnReplace) { + translation.setLocation(fp.file); + // get translators again, bc now we can check against the file + translators = translation.getTranslators(); + if(translators.length) { + // TODO: display a list of available translators + translation.setTranslator(translators[0]); + translation.setHandler("itemDone", _importItemDone); + translation.translate(); + } + } + } + + /* + * Saves items after they've been imported. We could have a nice little + * "items imported" indicator, too. 
+ */ + function _importItemDone(obj, item) { + item.save(); + } } \ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/ingester/browser.js b/chrome/chromeFiles/content/scholar/ingester/browser.js @@ -25,8 +25,7 @@ Scholar_Ingester_Interface._scrapeProgress = new Array(); */ Scholar_Ingester_Interface.init = function() { Scholar_Ingester_Interface.browsers = new Array(); - Scholar_Ingester_Interface.browserDocuments = new Object(); - Scholar_Ingester_Interface.browserUris = new Array(); + Scholar_Ingester_Interface.browserData = new Object(); Scholar_Ingester_Interface._scrapePopupShowing = false; Scholar.Ingester.ProxyMonitor.init(); @@ -54,7 +53,7 @@ Scholar_Ingester_Interface.chromeLoad = function() { * When chrome unloads, delete our document objects and remove our listeners */ Scholar_Ingester_Interface.chromeUnload = function() { - delete Scholar_Ingester_Interface.browserDocuments; + delete Scholar_Ingester_Interface.browserData, Scholar_Ingester_Interface.browsers; this.tabBrowser.removeProgressListener(this); } @@ -62,30 +61,20 @@ Scholar_Ingester_Interface.chromeUnload = function() { * Scrapes a page (called when the capture icon is clicked) */ Scholar_Ingester_Interface.scrapeThisPage = function(saveLocation) { - var documentObject = Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); - if(documentObject.scraper) { - var scrapeProgress = new Scholar_Ingester_Interface.Progress(window); - Scholar_Ingester_Interface._scrapeProgress.push(scrapeProgress); - documentObject.scrapePage(function(obj, returnValue) { Scholar_Ingester_Interface._finishScraping(obj, returnValue, scrapeProgress, saveLocation) }); - } -} - -/* - * Updates the status of the capture icon to reflect the scrapability or lack - * thereof of the current page - */ -Scholar_Ingester_Interface.updateStatus = function() { - var documentObject = 
Scholar_Ingester_Interface._getDocument(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); - if(documentObject && documentObject.scraper) { - if(documentObject.type == "multiple") { - // Use folder icon for multiple types, for now - Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treesource-collection.png"; - } else { - Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+documentObject.type+".png"; - } - Scholar_Ingester_Interface.statusImage.hidden = false; - } else { - Scholar_Ingester_Interface.statusImage.hidden = true; + var browser = Scholar_Ingester_Interface.tabBrowser.selectedBrowser; + var data = Scholar_Ingester_Interface._getData(browser); + + if(data.translators && data.translators.length) { + Scholar_Ingester_Interface.Progress.show(); + + var translate = new Scholar.Translate("web"); + translate.setBrowser(browser); + // use first translator available + translate.setTranslator(data.translators[0]); + translate.setHandler("select", Scholar_Ingester_Interface._selectItems); + translate.setHandler("itemDone", Scholar_Ingester_Interface._itemDone); + translate.setHandler("done", Scholar_Ingester_Interface._finishScraping); + translate.translate(); } } @@ -122,8 +111,14 @@ Scholar_Ingester_Interface.contentLoad = function(event) { return; } - Scholar_Ingester_Interface._setDocument(browser); - Scholar_Ingester_Interface.updateStatus(); + // get data object + var data = Scholar_Ingester_Interface._getData(browser); + // get translators + var translate = new Scholar.Translate("web"); + translate.setBrowser(browser); + data.translators = translate.getTranslators(); + // update status + Scholar_Ingester_Interface._updateStatus(data); } } @@ -162,13 +157,12 @@ Scholar_Ingester_Interface.Listener.onLocationChange = function(progressObject) Scholar_Ingester_Interface._deleteDocument(browser); } } - Scholar_Ingester_Interface.updateStatus(); + + var data = 
Scholar_Ingester_Interface._getData(Scholar_Ingester_Interface.tabBrowser.selectedBrowser); + Scholar_Ingester_Interface._updateStatus(data); // Make sure scrape progress is gone - var scrapeProgress; - while(scrapeProgress = Scholar_Ingester_Interface._scrapeProgress.pop()) { - scrapeProgress.kill(); - } + Scholar_Ingester_Interface.Progress.kill(); } Scholar_Ingester_Interface.hidePopup = function(collectionID) { @@ -219,53 +213,41 @@ Scholar_Ingester_Interface.showPopup = function(collectionID, parentElement) { ////////////////////////////////////////////////////////////////////////////// /* - * Gets a document object given a browser window object + * Gets a data object given a browser window object * * NOTE: Browser objects are associated with document objects via keys generated * from the time the browser object is opened. I'm not sure if this is the * appropriate mechanism for handling this, but it's what PiggyBank used and it * appears to work. + * + * Currently, the data object contains only one property: "translators," which + * is an array of translators that should work with the given page as returned + * from Scholar.Translate.getTranslator() */ -Scholar_Ingester_Interface._getDocument = function(browser) { +Scholar_Ingester_Interface._getData = function(browser) { try { var key = browser.getAttribute("scholar-key"); - if(Scholar_Ingester_Interface.browserDocuments[key]) { - return Scholar_Ingester_Interface.browserDocuments[key]; + if(Scholar_Ingester_Interface.browserData[key]) { + return Scholar_Ingester_Interface.browserData[key]; } - } finally {} - return false; -} - -/* - * Creates a new document object for a browser window object, attempts to - * retrieve appropriate scraper - */ -Scholar_Ingester_Interface._setDocument = function(browser) { - try { - var key = browser.getAttribute("scholar-key"); } finally { if(!key) { var key = (new Date()).getTime(); browser.setAttribute("scholar-key", key); + Scholar_Ingester_Interface.browserData[key] = new 
Array(); + return Scholar_Ingester_Interface.browserData[key]; } } - - // Only re-load the scraper if it's a new document - //if(Scholar_Ingester_Interface.browserUris[key] != browser.contentDocument.location.href) { - Scholar_Ingester_Interface.browserUris[key] = browser.contentDocument.location.href; - Scholar_Ingester_Interface.browserDocuments[key] = new Scholar.Ingester.Document(browser, window); - Scholar_Ingester_Interface.browserDocuments[key].retrieveScraper(); - //} } /* * Deletes the document object associated with a given browser window object */ -Scholar_Ingester_Interface._deleteDocument = function(browser) { +Scholar_Ingester_Interface._deleteData = function(browser) { try { var key = browser.getAttribute("scholar-key"); - if(Scholar_Ingester_Interface.browserDocuments[key]) { - delete Scholar_Ingester_Interface.browserDocuments[key]; + if(Scholar_Ingester_Interface.browserData[key]) { + delete Scholar_Ingester_Interface.browserData[key]; return true; } } finally {} @@ -273,41 +255,59 @@ Scholar_Ingester_Interface._deleteDocument = function(browser) { } /* - * Callback to be executed when scraping is complete + * Updates the status of the capture icon to reflect the scrapability or lack + * thereof of the current page */ -Scholar_Ingester_Interface._finishScraping = function(obj, returnValue, scrapeProgress, saveLocation) { - if(obj.items.length) { - scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeComplete")); - - // Display title and creators - var labels = new Array(); - var icons = new Array(); - for(var i in obj.items) { - labels.push(obj.items[i].getField("title")); - icons.push("chrome://scholar/skin/treeitem-"+Scholar.ItemTypes.getName(obj.items[i].getField("itemTypeID"))+".png"); - } - scrapeProgress.addLines(labels, icons); - - // Get collection if the user used the drop-down menu - if(saveLocation) { - var saveCollection = Scholar.Collections.get(saveLocation); - } - // Save items - for(i in obj.items) { - 
obj.items[i].save(); - if(saveLocation) { - saveCollection.addItem(obj.items[i].getID()); - } +Scholar_Ingester_Interface._updateStatus = function(data) { + if(data.translators && data.translators.length) { + var itemType = data.translators[0].itemType; + if(itemType == "multiple") { + // Use folder icon for multiple types, for now + Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treesource-collection.png"; + } else { + Scholar_Ingester_Interface.statusImage.src = "chrome://scholar/skin/treeitem-"+itemType+".png"; } - - setTimeout(function() { scrapeProgress.fade() }, 2500); - } else if(returnValue) { - scrapeProgress.kill(); + Scholar_Ingester_Interface.statusImage.hidden = false; } else { - scrapeProgress.changeHeadline(Scholar.getString("ingester.scrapeError")); - scrapeProgress.addDescription(Scholar.getString("ingester.scrapeErrorDescription")); - setTimeout(function() { scrapeProgress.fade() }, 2500); + Scholar_Ingester_Interface.statusImage.hidden = true; + } +} + +/* + * Callback to be executed when an item has been finished + */ +Scholar_Ingester_Interface._itemDone = function(obj, item) { + var title = item.getField("title"); + var icon = "chrome://scholar/skin/treeitem-"+Scholar.ItemTypes.getName(item.getField("itemTypeID"))+".png" + Scholar_Ingester_Interface.Progress.addLines([title], [icon]); + item.save(); +} + +/* + * called when a user is supposed to select items + */ +Scholar_Ingester_Interface._selectItems = function(obj, itemList) { + // this is kinda ugly, mozillazine made me do it! honest! 
+ var io = { dataIn:itemList, dataOut:null } + var newDialog = window.openDialog("chrome://scholar/content/ingester/selectitems.xul", + "_blank","chrome,modal,centerscreen,resizable=yes", io); + + if(!io.dataOut) { // user selected no items, so kill the progress indicatior + Scholar_Ingester_Interface.Progress.kill(); } + + return io.dataOut; +} + +/* + * Callback to be executed when scraping is complete + */ +Scholar_Ingester_Interface._finishScraping = function(obj, returnValue) { + if(!returnValue) { + Scholar_Ingester_Interface.Progress.changeHeadline(Scholar.getString("ingester.scrapeError")); + Scholar_Ingester_Interface.Progress.addDescription(Scholar.getString("ingester.scrapeErrorDescription")); + } + Scholar_Ingester_Interface.Progress.fade(); } ////////////////////////////////////////////////////////////////////////////// @@ -317,99 +317,126 @@ Scholar_Ingester_Interface._finishScraping = function(obj, returnValue, scrapePr ////////////////////////////////////////////////////////////////////////////// // Handles the display of a div showing progress in scraping - -Scholar_Ingester_Interface.Progress = function(myWindow) { - this.openerWindow = myWindow; - this.progressWindow = myWindow.openDialog("chrome://scholar/chrome/ingester/progress.xul", "", "chrome,dialog=no,titlebar=no,popup=yes"); - var me = this; - this.progressWindow.addEventListener("load", function() { me.windowLoaded() }, false); +Scholar_Ingester_Interface.Progress = new function() { + var _windowLoaded = false; + var _windowLoading = false; + // keep track of all of these things in case they're called before we're + // done loading the progress window + var _loadDescription = null; + var _loadLines = new Array(); + var _loadIcons = new Array(); + var _loadHeadline = Scholar.getString("ingester.scraping"); - this._loadDescription = null; - this._loadLines = new Array(); - this._loadIcons = new Array(); - this._loadHeadline = Scholar.getString("ingester.scraping"); -} - 
-Scholar_Ingester_Interface.Progress.prototype.windowLoaded = function() { - this._windowLoaded = true; - this._move(); + this.show = show; + this.changeHeadline = changeHeadline; + this.addLines = addLines; + this.addDescription = addDescription; + this.fade = fade; + this.kill = kill; - this.changeHeadline(this._loadHeadline); - this.addLines(this._loadLines, this._loadIcons); - if(this._loadDescription) { - this.addDescription(this._loadDescription); + function show() { + if(_windowLoading || _windowLoaded) { // already loading or loaded + return false; + } + _progressWindow = window.openDialog("chrome://scholar/chrome/ingester/progress.xul", "", "chrome,dialog=no,titlebar=no,popup=yes"); + _progressWindow.addEventListener("load", _onWindowLoaded, false); + _windowLoading = true; } -} - -Scholar_Ingester_Interface.Progress.prototype.changeHeadline = function(headline) { - if(this._windowLoaded) { - this.progressWindow.document.getElementById("scholar-progress-text-headline").value = headline; - } else { - this._loadHeadline = headline; + + function changeHeadline(headline) { + if(_windowLoaded) { + _progressWindow.document.getElementById("scholar-progress-text-headline").value = headline; + } else { + _loadHeadline = headline; + } } -} - -Scholar_Ingester_Interface.Progress.prototype.addLines = function(label, icon) { - if(this._windowLoaded) { - for(i in label) { - var newLabel = this.progressWindow.document.createElement("label"); - newLabel.setAttribute("class", "scholar-progress-item-label"); - newLabel.setAttribute("crop", "end"); - newLabel.setAttribute("value", label[i]); - - var newImage = this.progressWindow.document.createElement("image"); - newImage.setAttribute("class", "scholar-progress-item-icon"); - newImage.setAttribute("src", icon[i]); + + function addLines(label, icon) { + if(_windowLoaded) { + for(i in label) { + var newLabel = _progressWindow.document.createElement("label"); + newLabel.setAttribute("class", "scholar-progress-item-label"); + 
newLabel.setAttribute("crop", "end"); + newLabel.setAttribute("value", label[i]); + + var newImage = _progressWindow.document.createElement("image"); + newImage.setAttribute("class", "scholar-progress-item-icon"); + newImage.setAttribute("src", icon[i]); + + var newHB = _progressWindow.document.createElement("hbox"); + newHB.setAttribute("class", "scholar-progress-item-hbox"); + newHB.setAttribute("valign", "center"); + newHB.appendChild(newImage); + newHB.appendChild(newLabel); + + _progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB); + } - var newHB = this.progressWindow.document.createElement("hbox"); + _move(); + } else { + _loadLines = _loadLines.concat(label); + _loadIcons = _loadIcons.concat(icon); + } + } + + function addDescription(text) { + if(_windowLoaded) { + var newHB = _progressWindow.document.createElement("hbox"); newHB.setAttribute("class", "scholar-progress-item-hbox"); - newHB.setAttribute("valign", "center"); - newHB.appendChild(newImage); - newHB.appendChild(newLabel); + var newDescription = _progressWindow.document.createElement("description"); + newDescription.setAttribute("class", "scholar-progress-description"); + var newText = _progressWindow.document.createTextNode(text); - this.progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB); + newDescription.appendChild(newText); + newHB.appendChild(newDescription); + _progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB); + + _move(); + } else { + _loadDescription = text; } - - this._move(); - } else { - this._loadLines = this._loadLines.concat(label); - this._loadIcons = this._loadIcons.concat(icon); } -} - -Scholar_Ingester_Interface.Progress.prototype.addDescription = function(text) { - if(this._windowLoaded) { - var newHB = this.progressWindow.document.createElement("hbox"); - newHB.setAttribute("class", "scholar-progress-item-hbox"); - var newDescription = 
this.progressWindow.document.createElement("description"); - newDescription.setAttribute("class", "scholar-progress-description"); - var newText = this.progressWindow.document.createTextNode(text); + + function fade() { + setTimeout(_timeout, 2500); + } + + function kill() { + _windowLoaded = false; + try { + _progressWindow.close(); + } catch(ex) {} + } + + function _onWindowLoaded() { + _windowLoading = false; + _windowLoaded = true; - newDescription.appendChild(newText); - newHB.appendChild(newDescription); - this.progressWindow.document.getElementById("scholar-progress-text-box").appendChild(newHB); + _move(); + // do things we delayed because the winodw was loading + changeHeadline(_loadHeadline); + addLines(_loadLines, _loadIcons); + if(_loadDescription) { + addDescription(_loadDescription); + } - this._move(); - } else { - this._loadDescription = text; + // reset parameters + _loadDescription = null; + _loadLines = new Array(); + _loadIcons = new Array(); + _loadHeadline = Scholar.getString("ingester.scraping") + } + + function _move() { + _progressWindow.sizeToContent(); + _progressWindow.moveTo( + window.screenX + window.outerWidth - _progressWindow.outerWidth - 30, + window.screenY + window.outerHeight - _progressWindow.outerHeight + ); + } + + function _timeout() { + kill(); // could check to see if we're really supposed to fade yet + // (in case multiple scrapers are operating at once) } } - -Scholar_Ingester_Interface.Progress.prototype._move = function() { - this.progressWindow.sizeToContent(); - this.progressWindow.moveTo( - this.openerWindow.screenX + this.openerWindow.outerWidth - this.progressWindow.outerWidth - 30, - this.openerWindow.screenY + this.openerWindow.outerHeight - this.progressWindow.outerHeight - ); -} - -Scholar_Ingester_Interface.Progress.prototype.fade = function() { - this.kill(); -} - -Scholar_Ingester_Interface.Progress.prototype.kill = function() { - try { - this.progressWindow.close(); - } catch(ex) {} -} - diff --git 
a/chrome/chromeFiles/content/scholar/xpcom/ingester.js b/chrome/chromeFiles/content/scholar/xpcom/ingester.js @@ -19,47 +19,6 @@ Scholar.Ingester.deleteHiddenBrowser = function(myBrowser) { Scholar.debug("deleted hidden browser"); } -/* - * Operates the ingester given only a URL - * url - URL to scrape - * complete - callback function to be executed if page grab completes - * (will be passed document object; obj.items contains array of - * *unsaved* items scraped; empty array indicates unscrapable page) - * error - callback function to be executed if an error occurred loading page - * myWindow - optional argument indicating window to attach a dialog to. if no - * window is given, Firefox Scholar uses the hidden DOM window and - * will simply avoid scraping multiple pages - */ -Scholar.Ingester.ingestURL = function(url, complete, error, myWindow) { - var isHidden = false; - if(!myWindow) { - var myWindow = Components.classes["@mozilla.org/appshell/appShellService;1"] - .getService(Components.interfaces.nsIAppShellService) - .hiddenDOMWindow; - var isHidden = true; - } - - var succeeded = function(browser) { - var myDoc = new Scholar.Ingester.Document(browser, myWindow, isHidden); - if(myDoc.retrieveTranslator()) { - myDoc.scrapePage(function(myDoc) { - Scholar.Ingester.deleteHiddenBrowser(browser); - complete(myDoc); - }); - } else { - Scholar.Ingester.deleteHiddenBrowser(browser); - complete(myDoc); - } - } - - var failed = function() { - Scholar.debug("Scholar.Ingester.ingestURL: could not ingest "+url); - error(); - } - - Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed, true); -} - ///////////////////////////////////////////////////////////////// // // Scholar.Ingester.ProxyMonitor @@ -101,54 +60,56 @@ Scholar.Ingester.ProxyMonitor = new function() { function observe(channel) { channel.QueryInterface(Components.interfaces.nsIHttpChannel); - if(channel.getResponseHeader("Server") == "EZproxy") { - // We're connected to an 
EZproxy - if(channel.responseStatus != "302") { - return; - } - - Scholar.debug(channel.URI.spec); - // We should be able to scrape the URL out of this - var m = _ezProxyRe.exec(channel.URI.spec); - if(!m) { - return; - } - - // Found URL - var variable = m[1]; - var properURL = m[2]; - if(variable.toLowerCase() == "qurl") { - properURL = unescape(properURL); - } - var properURI = _parseURL(properURL); - if(!properURI) { - return; - } - - // Get the new URL - var newURL = channel.getResponseHeader("Location"); - if(!newURL) { - return; - } - var newURI = _parseURL(newURL); - if(!newURI) { - return; - } - - if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) { - // Different ports but the same server means EZproxy active + try { + if(channel.getResponseHeader("Server") == "EZproxy") { + // We're connected to an EZproxy + if(channel.responseStatus != "302") { + return; + } + + Scholar.debug(channel.URI.spec); + // We should be able to scrape the URL out of this + var m = _ezProxyRe.exec(channel.URI.spec); + if(!m) { + return; + } + + // Found URL + var variable = m[1]; + var properURL = m[2]; + if(variable.toLowerCase() == "qurl") { + properURL = unescape(properURL); + } + var properURI = _parseURL(properURL); + if(!properURI) { + return; + } + + // Get the new URL + var newURL = channel.getResponseHeader("Location"); + if(!newURL) { + return; + } + var newURI = _parseURL(newURL); + if(!newURI) { + return; + } - Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort); - // Initialize variables here so people who never use EZProxies - // don't get the (very very minor) speed hit - if(!_mapFromProxy) { - _mapFromProxy = new Object(); - _mapToProxy = new Object(); + if(channel.URI.host == newURI.host && channel.URI.port != newURI.port) { + // Different ports but the same server means EZproxy active + + Scholar.debug("EZProxy: host "+newURI.hostPort+" is really "+properURI.hostPort); + // Initialize variables here so people who 
never use EZProxies + // don't get the (very very minor) speed hit + if(!_mapFromProxy) { + _mapFromProxy = new Object(); + _mapToProxy = new Object(); + } + _mapFromProxy[newURI.hostPort] = properURI.hostPort; + _mapToProxy[properURI.hostPort] = newURI.hostPort; } - _mapFromProxy[newURI.hostPort] = properURI.hostPort; - _mapToProxy[properURI.hostPort] = newURI.hostPort; } - } + } catch(e) {} } /* @@ -195,394 +156,4 @@ Scholar.Ingester.ProxyMonitor = new function() { var uri = ioService.newURI(url, null, null); return uri; } -} - -///////////////////////////////////////////////////////////////// -// -// Scholar.Ingester.Model -// -///////////////////////////////////////////////////////////////// - -// Scholar.Ingester.Model, an object representing an RDF data model with -// methods to add to that model. In Piggy Bank, this was implemented in Java, -// but seeing as we don't really want an enormous web server running with FS, -// but we don't actually need that, so it's much simpler. -// -// The Java version of this class can be viewed at -// http://simile.mit.edu/repository/piggy-bank/trunk/src/java/edu/mit/simile/piggyBank/WorkingModel.java -Scholar.Ingester.Model = function() { - this.data = new Object(); -} - -// Piggy Bank provides a fourth argument, one that determines if the third -// argument is a literal or an RDF URI. Since our ontologies are -// sufficiently restricted, we have no chance of confusing a literal and an -// RDF URI and thus this is unnecessary. 
-Scholar.Ingester.Model.prototype.addStatement = function(uri, rdfUri, literal) { - if(!this.data[uri]) this.data[uri] = new Object(); - if(!this.data[uri][rdfUri]) { - this.data[uri][rdfUri] = new Array(); - } - this.data[uri][rdfUri].push(literal); - Scholar.debug(rdfUri+" for "+uri+" is "+literal); -} - -// Additional functions added for compatibility purposes only -// No idea if any scraper actually uses these, but just in case, they're -// implemented so as not to throw an exception -Scholar.Ingester.Model.prototype.addTag = function() {} -Scholar.Ingester.Model.prototype.getRepository = function() {} -Scholar.Ingester.Model.prototype.detachRepository = function() {} - -////////////////////////////////////////////////////////////////////////////// -// -// Scholar.Ingester.Document -// -////////////////////////////////////////////////////////////////////////////// - -/* THIS CODE IS GOING AWAY - * eventually, all ingesting will be part of a unified API in Scholar.Translate. - * until then, Scholar.Ingester.Document reigns supreme. 
- * - * Public properties: - * browser - browser window object of document - * model - data model for semantic scrapers - * scraper - best scraper to use to scrape page - * items - items returned after page is scraped - * window - window, for creating new hidden browsers - * url - url, as passed through proxy system - * type - type of item that will be scraped (set after retrieveScraper() is - * called) - * - * Private properties: - * _sandbox - sandbox for code execution - * _scrapeCallback - callback function to be executed when scraping is complete - */ - -////////////////////////////////////////////////////////////////////////////// -// -// Public Scholar.Ingester.Document methods -// -////////////////////////////////////////////////////////////////////////////// - -/* - * Constructor for Document object - */ -Scholar.Ingester.Document = function(myBrowser, myWindow, isHidden) { - this.browser = myBrowser; - this.window = myWindow; - this.isHidden = isHidden; - this.scraper = this.type = null; - this.model = new Scholar.Ingester.Model(); - - // Create separate URL to account for proxies - this.url = Scholar.Ingester.ProxyMonitor.proxyToProper(this.browser.contentDocument.location.href); - if(this.url != this.browser.contentDocument.location.href) { - this.proxiedURL = true; - } - - this.items = new Array(); - this._generateSandbox(); -} - -/* - * Retrieves the best scraper to scrape a given page - */ -Scholar.Ingester.Document.prototype.retrieveScraper = function() { - Scholar.debug("Retrieving scrapers for "+this.url); - - var sql = 'SELECT * FROM translators WHERE type = 3 ORDER BY target IS NULL ASC'; - var scrapers = Scholar.DB.query(sql); - for(var i=0; i<scrapers.length; i++) { - var currentScraper = scrapers[i]; - if(this.canScrape(currentScraper)) { - this.scraper = currentScraper; - Scholar.debug("Found scraper "+this.scraper.label); - return true; - } - } - return false; -} - -/* - * Check to see if _scraper_ can scrape this document - */ 
-Scholar.Ingester.Document.prototype.canScrape = function(currentScraper) { - var canScrape = false; - - // Test with regular expression - // If this is slow, we could preload all scrapers and compile regular - // expressions, so each check will be faster - if(currentScraper.target) { - var regularExpression = new RegExp(currentScraper.target, "i"); - if(regularExpression.test(this.url)) { - canScrape = true; - } - } - - // Test with JavaScript if available and didn't have a regular expression or - // passed regular expression test - if((!currentScraper.target || canScrape) - && currentScraper.detectCode) { - Scholar.debug("Checking detectCode"); - var scraperSandbox = this._sandbox; - try { - canScrape = Components.utils.evalInSandbox("(function(){\n" + - currentScraper.detectCode + - "\n})()", scraperSandbox); - } catch(e) { - Scholar.debug(e+' in detectCode for '+currentScraper.label); - return false; - } - - // detectCode returns text type - if(canScrape.toString() != "") { - this.type = canScrape; - } else { - this.type = "website"; - } - } - return canScrape; -} - -/* - * Populate model with semantic data regarding this page using _scraper_ - * Callback will be executed once scraping is complete - */ -Scholar.Ingester.Document.prototype.scrapePage = function(callback) { - if(callback) { - this._scrapeCallback = callback; - } - - Scholar.debug("Scraping "+this.url); - - var scraperSandbox = this._sandbox; - try { - var returnValue = Components.utils.evalInSandbox("(function(){\n" + - this.scraper.code + - "\n})()", scraperSandbox); - } catch(e) { - Scholar.debug(e+' in code for '+this.scraper.label); - this._scrapePageComplete(false); - return; - } - - // If synchronous, call _scrapePageComplete(); - if(!this._waitForCompletion) { - Scholar.debug("is asynch"); - this._scrapePageComplete(returnValue); - } -} - -////////////////////////////////////////////////////////////////////////////// -// -// Private Scholar.Ingester.Document methods -// 
-////////////////////////////////////////////////////////////////////////////// - -/* - * Piggy Bank/FS offers four objects to JavaScript scrapers - * browser - the object representing the open browser window containing the - * document to be processes - * doc - the DOM (basically just browser.contentDocument) - * model - the object representing the RDF model of data to be returned - * (see Scholar.Ingester.Model) - * utilities - a set of utilities for making certain tasks easier - * (see Scholar.Utilities); - * - * Piggy Bank/FS also offers two functions to simplify asynchronous requests - * (these will only be available for scraping, and not for scrape detection) - * wait() - called on asynchronous requests so that Piggy Bank/FS will not - * automatically return at the end of code execution - * done() - when wait() is called, Piggy Bank/FS will wait for this - * function before returning - */ - -/* - * Called when scraping (synchronous or asynchronous) is complete - */ -Scholar.Ingester.Document.prototype._scrapePageComplete = function(returnValue) { - this._updateDatabase(); - if(this._scrapeCallback) { - this._scrapeCallback(this, returnValue); - } - // Get us ready for another scrape - delete this.model; - delete this.items; - this.model = new Scholar.Ingester.Model(); - this.items = new Array(); - this._waitForCompletion = false; - // This is perhaps a bit paranoid, but we need to get the model redone anyway - this._generateSandbox(); -} - -/* - * Generates a sandbox for scraping/scraper detection - */ -Scholar.Ingester.Document.prototype._generateSandbox = function() { - this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href); - this._sandbox.browser = this.browser; - this._sandbox.doc = this.browser.contentDocument; - this._sandbox.url = this.url; - this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden); - this._sandbox.utilities.HTTPUtilities = new 
Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL); - this._sandbox.window = this.window; - this._sandbox.model = this.model; - this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult; - this._sandbox.MARC_Record = Scholar.Ingester.MARC_Record; - this._sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record(); - - var me = this; - this._sandbox.wait = function(){ me._waitForCompletion = true; }; - this._sandbox.done = function(){ me._scrapePageComplete(); }; -} - -Scholar.Ingester.Document.prototype._associateRDF = function(rdfUri, field, uri, item, typeID) { - var fieldID; - if(fieldID = Scholar.ItemFields.getID(field)) { - if(this.model.data[uri][rdfUri] && Scholar.ItemFields.isValidForType(fieldID, typeID)) { - item.setField(field, this.model.data[uri][rdfUri][0]); - } else { - Scholar.debug("discarded scraper " + field + " data: not valid for item type "+typeID); - } - } else { - Scholar.debug("discarded scraper " + field + " data: no field in database"); - } -} - -/* - * Add data ingested using RDF to database - * (Ontologies are hard-coded until we have a real way of dealing with them) - */ -Scholar.Ingester.Document.prototype._updateDatabase = function() { - Scholar.debug("doing updating"); - - var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'; - var prefixDC = 'http://purl.org/dc/elements/1.1/'; - var prefixDCMI = 'http://purl.org/dc/dcmitype/'; - var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/'; - - // Call number fields, in order of preference - var callNumbers = new Array("LCC", "DDC", "UDC", "NLM", "NAL", "CN"); - - try { - for(var uri in this.model.data) { - // Get typeID, defaulting to "website" - try { - var type = this.model.data[uri][prefixRDF + 'type'][0].substr(prefixDummy.length); - var typeID = Scholar.ItemTypes.getID(type); - } catch(ex) { - var typeID = Scholar.ItemTypes.getID("website") - } - - var newItem = Scholar.Items.getNewItemByType(typeID); - - // Handle source and title - 
newItem.setField("source", uri); - if(this.model.data[uri][prefixDC + 'title']) { - newItem.setField("title", this.model.data[uri][prefixDC + 'title'][0]); - } - - // Handle creators and contributors - var creatorIndex = 0; - if(this.model.data[uri][prefixDC + 'creator']) { - for(i in this.model.data[uri][prefixDC + 'creator']) { - var creator = this.model.data[uri][prefixDC + 'creator'][i]; - var spaceIndex = creator.lastIndexOf(" "); - var lastName = creator.substring(spaceIndex+1, creator.length); - var firstName = creator.substring(0, spaceIndex); - - newItem.setCreator(creatorIndex, firstName, lastName, 1); - creatorIndex++; - } - } - if(this.model.data[uri][prefixDC + 'contributor']) { - for(i in this.model.data[uri][prefixDC + 'contributor']) { - var creator = this.model.data[uri][prefixDC + 'contributor'][i]; - var spaceIndex = creator.lastIndexOf(" "); - var lastName = creator.substring(spaceIndex+1, creator.length); - var firstName = creator.substring(0, spaceIndex); - - newItem.setCreator(creatorIndex, firstName, lastName, 2); - creatorIndex++; - } - } - if(this.model.data[uri][prefixDummy + 'corporateCreator']) { - for(i in this.model.data[uri][prefixDummy + 'corporateCreator']) { - newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 'corporateCreator'][i], 1); - creatorIndex++; - } - } - if(this.model.data[uri][prefixDummy + 'corporateContributor']) { - for(i in this.model.data[uri][prefixDummy + 'corporateContributor']) { - newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 'corporateContributor'][i], 2); - creatorIndex++; - } - } - if(this.model.data[uri][prefixDummy + 'editor']) { - for(i in this.model.data[uri][prefixDummy + 'editor']) { - newItem.setCreator(creatorIndex, null, this.model.data[uri][prefixDummy + 'editor'][i], 3); - creatorIndex++; - } - } - - // Handle years, extracting from date if necessary - if(Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("year"), typeID)) { - 
if(this.model.data[uri][prefixDC + 'year']) { - newItem.setField("year", this.model.data[uri][prefixDC + 'year'][0]); - } else if(this.model.data[uri][prefixDC + 'date'] && this.model.data[uri][prefixDC + 'date'][0].length >= 4) { - var ISORe = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/ - if(ISORe.test(this.model.data[uri][prefixDC + 'date'][0])) { - newItem.setField("year", this.model.data[uri][prefixDC + 'date'][0].substr(0, 4)); - } else { - var m; - var yearRe = /[0-9]{4}$/; - if(m = yearRe.exec(this.model.data[uri][prefixDC + 'date'][0])) { - newItem.setField("year", m[0]); - } - } - } - } - - // Handle ISBNs/ISSNs/Call Numbers - if(this.model.data[uri][prefixDC + 'identifier']) { - var oldIndex = -1; - var needISSN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISSN"), typeID); - var needISBN = Scholar.ItemFields.isValidForType(Scholar.ItemFields.getID("ISBN"), typeID); - for(i in this.model.data[uri][prefixDC + 'identifier']) { - prefix = this.model.data[uri][prefixDC + 'identifier'][i].substr(0, this.model.data[uri][prefixDC + 'identifier'][i].indexOf(" ")); - if(needISSN && prefix == 'ISSN') { - newItem.setField("ISSN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5)); - needISSN = false; - } - if(needISBN && prefix == 'ISBN') { - newItem.setField("ISBN", this.model.data[uri][prefixDC + 'identifier'][i].substring(5)); - needISBN = false; - } - var newIndex = Scholar.arraySearch(prefix, callNumbers); - if(newIndex && newIndex > oldIndex) { - oldIndex = newIndex; - var callNumber = this.model.data[uri][prefixDC + 'identifier'][i].substring(prefix.length+1); - } - } - if(callNumber) { - newItem.setField("callNumber", callNumber); - } - } - - this._associateRDF(prefixDummy + 'publication', "publication", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'volume', "volume", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'number', "number", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'pages', "pages", uri, 
newItem, typeID); - this._associateRDF(prefixDC + 'publisher', "publisher", uri, newItem, typeID); - this._associateRDF(prefixDC + 'date', "date", uri, newItem, typeID); - this._associateRDF(prefixDC + 'hasVersion', "edition", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'series', "series", uri, newItem, typeID); - this._associateRDF(prefixDummy + 'place', "place", uri, newItem, typeID); - - this.items.push(newItem); - } - } catch(ex) { - Scholar.debug('Error in Scholar.Ingester.Document._updateDatabase: '+ex); - } } \ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/xpcom/marc.js b/chrome/chromeFiles/content/scholar/xpcom/marc.js @@ -1,532 +0,0 @@ -/* -* Scholar.Ingester.MARC_Record.js -* Stefano Bargioni, Pontificia Universitˆ della Santa Croce - Biblioteca -* Trattamento di record MARC in JavaScript -* -* Original version copyright (C) 2005 Stefano Bargioni, licensed under the LGPL -* (Available at http://www.pusc.it/bib/mel/Scholar.Ingester.MARC_Record.js) -* -* This library is free software; you can redistribute it or -* modify it under the terms of the GNU Lesser General Public -* License as published by the Free Software Foundation; either -* version 2.1 of the License, or (at your option) any later version. -* -* This library is distributed in the hope that it will be useful, -* but WITHOUT ANY WARRANTY; without even the implied warranty of -* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -* Lesser General Public License for more details. 
-*/ - -Scholar.Ingester.MARC_Record = function() { // new MARC record - this.VERSIONE = '2.6.6b'; - this.VERSIONE_data ='2005-05-10'; - - this.leader = { - record_length:'00000', - record_status:'n', // acdnp - type_of_record:' ', - bibliographic_level:' ', - type_of_control:' ', - character_coding_scheme:' ', - indicator_count:'2', - subfield_code_length:'2', - base_address_of_data:'00000', - encoding_level:' ', - descriptive_cataloging_form:' ', - linked_record_requirement:' ', - entry_map:'4500' - }; // 24 chars - - this.field_terminator = '\x1E'; - this.record_terminator = '\x1D'; - this.subfield_delimiter = '\x1F'; - this.directory = ''; - this.directory_terminator = this.field_terminator; - this.variable_fields = new Array(); - return this; -} - -Scholar.Ingester.MARC_Record.prototype.load = function(s,f) { // loads record s passed in format f - if (f == 'binary') { - this.leader.record_length = '00000'; - this.leader.record_status = s.substr(5,1); - this.leader.type_of_record = s.substr(6,1); - this.leader.bibliographic_level = s.substr(7,1); - this.leader.type_of_control = s.substr(8,1); - this.leader.character_coding_scheme = s.substr(9,1); - this.leader.indicator_count = '2'; - this.leader.subfield_code_length = '2'; - this.leader.base_address_of_data = '00000'; - this.leader.encoding_level = s.substr(17,1); - this.leader.descriptive_cataloging_form = s.substr(18,1); - this.leader.linked_record_requirement = s.substr(19,1); - this.leader.entry_map = '4500'; - - this.directory = ''; - this.directory_terminator = this.field_terminator; - this.variable_fields = new Array(); - - // loads fields - var campi = s.split(this.field_terminator); - var k; - for (k=1; k<-1+campi.length; k++) { // the first and the last are unuseful - // the first is the header + directory, the last is the this.record_terminator - var tag = campi[0].substr(24+(k-1)*12,3); - var ind1 = ''; var ind2 = ''; var value = campi[k]; - if (tag.substr(0,2) != '00') { - ind1 = 
campi[k].substr(0,1); - ind2 = campi[k].substr(1,1); - value = campi[k].substr(2); - } - this.add_field(tag,ind1,ind2,value); - } - } else if (f == 'MARC_Harvard') { - var linee = s.split('\n'); - for (var i=0; i<linee.length; i++) { - linee[i] = this._trim(linee[i]); - if (linee[i] == '') continue; // jumps empty lines - // linee[i] = linee[i].replace(/\t/g,' '); - linee[i] = linee[i].replace(/ \t/g,'\t'); - linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0 - var tranche = linee[i].split('|a '); - var tag, ind1, ind2, value; - if (tranche.length == 1) { - tag = linee[i].substr(0,3); - value = linee[i].substr(4); - } - else { - tag = tranche[0].substr(0,3); - ind1 = tranche[0].substr(3,1); - ind2 = tranche[0].substr(4,1); - value = tranche[1]; - value = this._trim(value); - var replacer = this.subfield_delimiter+'$1'; - value = value.replace(/\|(.) /g,replacer); - } - if (tag == 'LDR') { - this.leader.record_length = '00000'; - this.leader.record_status = value.substr(5,1); - this.leader.type_of_record = value.substr(6,1); - this.leader.bibliographic_level = value.substr(7,1); - this.leader.type_of_control = value.substr(8,1); - this.leader.character_coding_scheme = value.substr(9,1); - this.leader.indicator_count = '2'; - this.leader.subfield_code_length = '2'; - this.leader.base_address_of_data = '00000'; - this.leader.encoding_level = value.substr(17,1); - this.leader.descriptive_cataloging_form = value.substr(18,1); - this.leader.linked_record_requirement = value.substr(19,1); - this.leader.entry_map = '4500'; - - this.directory = ''; - this.directory_terminator = this.field_terminator; - this.variable_fields = new Array(); - } - else if (tag > '008' && tag < '899') { // jumps low and high tags, also H03 and similia - if (tag != '040') this.add_field(tag,ind1,ind2,value); - } - } - this.add_field_005(); - } else if (f == 'MARC_BNI') { - var linee = s.split('\n'); - for (var i=0; i<linee.length; i++) { - linee[i] = 
this._trim(linee[i]); - if (linee[i] == '') continue; // jumps empty lines - linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0 - linee[i] = linee[i].replace(/\|/g,' '); - linee[i] = linee[i].replace(/_/g,' '); - linee[i] = linee[i].replace(/\$/g,this.subfield_delimiter); - var tranche = linee[i].split('\t'); - var tag = tranche[0]; - var ind1 = tranche[1].substr(0,1); - var ind2 = tranche[1].substr(1,1); - var value = this._trim(tranche[2]); - if (tag == 'LEA') { - this.leader.record_length = '00000'; - this.leader.record_status = value.substr(5,1); - this.leader.type_of_record = value.substr(6,1); - this.leader.bibliographic_level = value.substr(7,1); - this.leader.type_of_control = value.substr(8,1); - this.leader.character_coding_scheme = value.substr(9,1); - this.leader.indicator_count = '2'; - this.leader.subfield_code_length = '2'; - this.leader.base_address_of_data = '00000'; - this.leader.encoding_level = value.substr(17,1); - this.leader.descriptive_cataloging_form = value.substr(18,1); - this.leader.linked_record_requirement = value.substr(19,1); - this.leader.entry_map = '4500'; - - this.directory = ''; - this.directory_terminator = this.field_terminator; - this.variable_fields = new Array(); - } - else if (tag > '008' && tag < '899') { // jumps low and high tags - if (tag != '040') this.add_field(tag,ind1,ind2,value); - } - } - this.add_field_005(); - } else if (f == 'MARC_Loc') { // MARC copiato dal browser dal sito catalog.loc.gov - var linee = s.split('\n'); - for (var i=0; i<linee.length; i++) { - linee[i] = this._trim(linee[i]); - if (linee[i] == '') continue; // jumps empty lines - linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0 - linee[i] = linee[i].replace(/_/g,' '); - linee[i] = linee[i].replace(/\t/g,''); - var replacer = this.subfield_delimiter+'$1'; - linee[i] = linee[i].replace(/\|(.) 
/g,replacer); - linee[i] = linee[i].replace(/\|/g,this.subfield_delimiter); - var tag = linee[i].substr(0,3); - var ind1 = linee[i].substr(4,1); - var ind2 = linee[i].substr(5,1); - var value = linee[i].substr(7); - if (tag == '000') { - linee[i] = linee[i].replace(/ /,' '); - value = linee[i].substr(4); - this.leader.record_length = '00000'; - this.leader.record_status = value.substr(5,1); - this.leader.type_of_record = value.substr(6,1); - this.leader.bibliographic_level = value.substr(7,1); - this.leader.type_of_control = value.substr(8,1); - this.leader.character_coding_scheme = value.substr(9,1); - this.leader.indicator_count = '2'; - this.leader.subfield_code_length = '2'; - this.leader.base_address_of_data = '00000'; - this.leader.encoding_level = value.substr(17,1); - this.leader.descriptive_cataloging_form = value.substr(18,1); - this.leader.linked_record_requirement = value.substr(19,1); - this.leader.entry_map = '4500'; - - this.directory = ''; - this.directory_terminator = this.field_terminator; - this.variable_fields = new Array(); - } - else if (tag > '008' && tag < '899') { // jumps low and high tags - if (tag != '040') this.add_field(tag,ind1,ind2,value); - } - } - this.add_field_005(); - } else if (f == 'MARC_PAC') { - var linee = s.split('\n'); - for (var i=0; i<linee.length; i++) { - linee[i] = linee[i].replace(/\xA0/g,' '); // in some browsers, nbsp is copied as xA0 - linee[i] = linee[i].replace(/_/g,' '); - linee[i] = linee[i].replace(/\t/g,''); - linee[i] = this._trim(linee[i]); - if (linee[i] == '') continue; // jumps empty lines - var replacer = this.subfield_delimiter+'$1'; - linee[i] = linee[i].replace(/\|(.)/g,replacer); - linee[i] = linee[i].replace(/\|/g,this.subfield_delimiter); - var tag = linee[i].substr(0,3); - var ind1 = linee[i].substr(4,1); - var ind2 = linee[i].substr(5,1); - var value = this.subfield_delimiter+'a'+linee[i].substr(7); - if(linee[i].substr(0, 6) == "LEADER") { - value = linee[i].substr(7); - 
this.leader.record_length = '00000'; - this.leader.record_status = value.substr(5,1); - this.leader.type_of_record = value.substr(6,1); - this.leader.bibliographic_level = value.substr(7,1); - this.leader.type_of_control = value.substr(8,1); - this.leader.character_coding_scheme = value.substr(9,1); - this.leader.indicator_count = '2'; - this.leader.subfield_code_length = '2'; - this.leader.base_address_of_data = '00000'; - this.leader.encoding_level = value.substr(17,1); - this.leader.descriptive_cataloging_form = value.substr(18,1); - this.leader.linked_record_requirement = value.substr(19,1); - this.leader.entry_map = '4500'; - - this.directory = ''; - this.directory_terminator = this.field_terminator; - this.variable_fields = new Array(); - } - else if (tag > '008' && tag < '899') { // jumps low and high tags - if (tag != '040') this.add_field(tag,ind1,ind2,value); - } - } - this.add_field_005(); - } - - this.update_record_length(); - this.update_base_address_of_data(); - return this; -} - -Scholar.Ingester.MARC_Record.prototype.update_base_address_of_data = function() { // updates the base_address - this.leader.base_address_of_data = this._zero_fill(24+this.variable_fields.length*12+1,5); - return this.leader.base_address_of_data; -} - -Scholar.Ingester.MARC_Record.prototype.update_displacements = function() { // rebuilds the directory - var displ = 0; - this.directory = ''; - for (var i=0; i<this.variable_fields.length; i++) { - var len = this.variable_fields[i].value.length + 1 + - this.variable_fields[i].ind1.length + - this.variable_fields[i].ind2.length; - this.directory += this.variable_fields[i].tag + - this._zero_fill(len,4) + this._zero_fill(displ,5); - displ += len; - } - return true; -} -Scholar.Ingester.MARC_Record.prototype.update_record_length = function() { // updates total record length - var fields_total_length = 0; var f; - for (f=0; f<this.variable_fields.length;f++) { - fields_total_length += 
this.variable_fields[f].ind1.length+this.variable_fields[f].ind2.length+this.variable_fields[f].value.length + 1; - } - var rl = 24+this.directory.length+1+fields_total_length+1; - this.leader.record_length = this._zero_fill(rl,5); -} - -Scholar.Ingester.MARC_Record.prototype.sort_directory = function() { // sorts directory and array variable_fields by tag and occ - // ordinamento della directory - if (this.directory.length <= 12) { return true; } // already sorted - var directory_entries = new Array(); - var i; - for (i=0; i<this.directory.length; i=i+12) { - directory_entries[directory_entries.length] = this.directory.substr(i,12); - } - directory_entries.sort(); - this.directory = directory_entries.join(''); - // sorts array variable_fields - this.variable_fields.sort(function(a,b) { return a.tag - b.tag + a.occ - b.occ; }); - return true; -} - -Scholar.Ingester.MARC_Record.prototype.show_leader = function() { - var leader = ''; var f; - for (f in this.leader) { leader += this.leader[f]; } - return leader; -} - -Scholar.Ingester.MARC_Record.prototype.show_fields = function() { - var fields = ''; var f; - for (f=0; f<this.variable_fields.length;f++) { - fields += this.variable_fields[f].ind1 + - this.variable_fields[f].ind2 + - this.variable_fields[f].value + - this.field_terminator; - } - return fields; -} - -Scholar.Ingester.MARC_Record.prototype.show_directory = function() { - var d = ''; - for (var i = 0; i<this.directory.length; i+=12) { - d += this.directory.substr(i,3) + ' ' + - this.directory.substr(i+3,4) + ' ' + - this.directory.substr(i+7,5) + '\n'; - } - return d; -} - -Scholar.Ingester.MARC_Record.prototype.add_field_005 = function() { - var now = new Date(); - now = now.getFullYear() + - this._zero_fill(now.getMonth()+1,2) + - this._zero_fill(now.getDate(),2) + - this._zero_fill(now.getHours(),2) + - this._zero_fill(now.getMinutes(),2) + - this._zero_fill(now.getSeconds(),2) + '.0'; - this.add_field('005','','',now); - return now; -} - 
-Scholar.Ingester.MARC_Record.prototype.count_occ = function(tag) { // counts occ of tag - var n = 0; - for (var i=0; i<this.variable_fields.length; i++) { - if (this.variable_fields[i].tag == tag) { n++; } - } - return n; -} - -Scholar.Ingester.MARC_Record.prototype.exists = function(tag) { // field existence - if (this.count_occ(tag) > 0) return true; - return false; -} - -Scholar.Ingester.MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC gield - this.tag = tag; - this.occ = rec.count_occ(tag)+1; // occurrence order no. - this.ind1 = ind1; if (this.ind1 == '') this.ind1 = ' '; - this.ind2 = ind2; if (this.ind2 == '') this.ind2 = ' '; - if (tag.substr(0,2) == '00') { - this.ind1 = ''; this.ind2 = ''; - } - this.value = value; - return this; -} - -Scholar.Ingester.MARC_Record.prototype.display = function(type) { // displays record in format type - type = type.toLowerCase(); - if (type == 'binary') return this.show_leader() + - this.directory + - this.field_terminator + - this.show_fields() + - this.record_terminator; - if (type == 'html') { - var s = '<table class="record_table">'; - var l = R.show_leader(); - s += '<tr><td class="tag">000</td><td class="ind"></td><td class="ind"></td><td class="record_value">'+l+'</td></tr>'; - var i; - for (i=0; i<this.variable_fields.length; i++) { - var ind1 = this.variable_fields[i].ind1; if (ind1 == ' ') { ind1 = '&#160'; } - var ind2 = this.variable_fields[i].ind2; if (ind2 == ' ') { ind2 = '&#160'; } - s += '<tr>'; - s += '<td class="tag">'+this.variable_fields[i].tag+'</td>'; - s += '<td class="ind">'+ind1+'</td>'; - s += '<td class="ind">'+ind2+'</td>'; - var v = this.variable_fields[i].value; - if (this.variable_fields[i].tag == '008') v = v.replace(/ /g,'&#160;'); - s += '<td class="record_value">'+this._ddagger(v)+'</td>'; - s += '</tr>'; - } - s += '</table>'; - return s; - } - if (type == 'xml') { - s = ''; - s += '<?xml version="1.0" encoding="iso-8859-1"?><collection 
xmlns="http://www.loc.gov/MARC21/slim"><record>'; - s += '<leader>'+this.show_leader()+'</leader>'; - // var i; - for (i=0; i<this.variable_fields.length; i++) { - ind1 = this.variable_fields[i].ind1; if (ind1 != '') ind1 = ' ind1="'+ind1+'"'; - ind2 = this.variable_fields[i].ind2; if (ind2 != '') ind2 = ' ind2="'+ind2+'"'; - if (this.variable_fields[i].tag.substr(0,2) == '00') s += '<controlfield tag="'+this.variable_fields[i].tag+'">'+this.variable_fields[i].value+'</controlfield>'; - else { - var subfields = this.variable_fields[i].value.split(this.subfield_delimiter); - // alert(this.variable_fields[i].value+' '+subfields.length); // test - if (subfields.length == 1) subfields[1] = '?'+this.variable_fields[i].value; - var sf = ''; - for (var j=1; j<subfields.length; j++) { - sf += '<subfield code="'+subfields[j].substr(0,1)+'">'+subfields[j].substr(1)+'</subfield>'; - } - s += '<datafield tag="' + this.variable_fields[i].tag + '"' + ind1 + ind2 + '>' + sf + '</datafield>'; - } - } - s += '</record></collection>'; - return s; - } - if (type == 'xml-html') { - s = this.display('xml'); - // abbellimenti - s = s.replace(/\<leader\>/,'\n <leader>'); - s = s.replace(/\<controlfield/g,'\n <controlfield'); - s = s.replace(/\<datafield/g,'\n <datafield'); - s = s.replace(/\<collection/g,'\n<collection'); - s = s.replace(/\<record/g,'\n<record'); - s = s.replace(/\<\/datafield/g,'\n </datafield'); - s = s.replace(/\<\/collection/g,'\n</collection'); - s = s.replace(/\<\/record/g,'\n</record'); - s = s.replace(/\<subfield/g,'\n <subfield'); - s = s.replace(/\x1F/g,'%1F'); s = this._ddagger(s); - // escape chars < e > - s = s.replace(/\</g,'&lt;'); - s = s.replace(/\>/g,'&gt;'); - // colore alle keyword - s = s.replace(/(controlfield|datafield|collection|record|leader|subfield)/g,'<span class="cdfield">$1</span>'); - s = s.replace(/(tag|code|ind1|ind2)=/g,'<span class="attrib">$1=</span>'); - return s; - } - return false; -} - 
-Scholar.Ingester.MARC_Record.prototype.get_field = function(tag) { // returns an array of values, one for each occurrence - var v = new Array(); var i; - for (i=0; i<this.variable_fields.length; i++) { - if (this.variable_fields[i].tag == tag) { - v[v.length] = this.variable_fields[i].ind1 + - this.variable_fields[i].ind2 + - this.variable_fields[i].value; - } - } - return v; -} - -// This function added by Simon Kornblith -Scholar.Ingester.MARC_Record.prototype.get_field_subfields = function(tag) { // returns a two-dimensional array of values - var field = this.get_field(tag); - var return_me = new Array(); - for(var i in field) { - return_me[i] = new Object(); - var subfields = field[i].split(this.subfield_delimiter); - if (subfields.length == 1) { - return_me[i]['?'] = field[i]; - } else { - for (var j=1; j<subfields.length; j++) { - return_me[i][subfields[j].substr(0,1)] = subfields[j].substr(1); - } - } - } - return return_me; -} - -Scholar.Ingester.MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record - if (tag.length != 3) { return false; } - var F = new this.MARC_field(this,tag,ind1,ind2,value); - // adds pointer to list of fields - this.variable_fields[this.variable_fields.length] = F; - // adds the entry to the directory - this.directory += F.tag+this._zero_fill(F.ind1.length+F.ind2.length+F.value.length+1,4)+'00000'; - // sorts the directory - this.sort_directory(); - // updates lengths - this.update_base_address_of_data(); - this.update_displacements(); - this.update_record_length(); - return F; -} - -Scholar.Ingester.MARC_Record.prototype.delete_field = function(tag,occurrence) { - // lookup and delete the occurrence from array variable_fields - var i; - for (i=0; i<this.variable_fields.length; i++) { - if (this.variable_fields[i].tag == tag && this.variable_fields[i].occ == occurrence) break; - } - if (i==this.variable_fields.length) return false; // campo non trovato - // deletes the occ. 
i from array variable_fields scaling next values - var j; - for (j=i+1; j<this.variable_fields.length; j++) { - this.variable_fields[i++]=this.variable_fields[j]; - } - this.variable_fields.length--; // deletes last element - // lookup and delete the occurrence from directory (must exist; no sort is needed) - var nocc = 0; - // var i; - for (i=0; i<this.directory.length;i=i+12) { - if (this.directory.substr(i,3) == tag) nocc++; - if (occurrence == nocc) { // occ found - break; - } - } - if (i >= this.directory.length) alert('Internal error!'); - this.directory = this.directory.substr(0,i) + this.directory.substr(i+12); - // updates lengths - this.update_base_address_of_data(); - this.update_displacements(); - this.update_record_length(); - return true; -} - -Scholar.Ingester.MARC_Record.prototype._ddagger = function(s) { // display doubledagger in html code - s = s.replace(/\%1F(.)/g, "<span class=\"this._ddagger\">&#135;$1</span>"); - s = s.replace(/\x1F(.)/g, "<span class=\"this._ddagger\">&#135;$1</span>"); - return s; -} - -Scholar.Ingester.MARC_Record.prototype._trim = function(s) { // eliminates blanks from both sides - s = s.replace(/\s+$/,''); - return s.replace(/^\s+/,''); -} - -Scholar.Ingester.MARC_Record.prototype._zero_fill = function(s,l) { // left '0' padding of s, up to l (l<=15) - var t = '000000000000000'; - t = t+s; - return t.substr(t.length-l,l); -} - -Scholar.Ingester.MARC_Record.prototype.version = function() { // returns version and date - return 'MARC Editor Lite '+this.VERSIONE+' ('+this.VERSIONE_data+')'; -} -\ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/xpcom/translate.js b/chrome/chromeFiles/content/scholar/xpcom/translate.js @@ -11,15 +11,15 @@ * * type can be: * export - * import (NOT IMPLEMENTED) - * web (NOT IMPLEMENTED) + * import + * web * * a typical export process: * var translatorObj = new Scholar.Translate(); * var possibleTranslators = translatorObj.getTranslators(); * // do something involving 
nsIFilePicker; remember, each possibleTranslator * // object has properties translatorID, label, and targetID - * translatorObj.setFile(myNsILocalFile); + * translatorObj.setLocation(myNsILocalFile); * translatorObj.setTranslator(possibleTranslators[x]); // also accepts only an ID * translatorObj.setHandler("done", _translationDone); * translatorObj.translate(); @@ -27,15 +27,19 @@ * * PUBLIC PROPERTIES: * - * type - the text type of translator (set by constructor) - * numeric type - the numeric type of translator (set by constructor) - * location - the location of the target (set by setLocation) + * type - the text type of translator (set by constructor, should be read only) + * browser - the browser object to be used for web scraping (read-only; set + * with setBrowser) + * translator - the translator currently in use (read-only; set with + * setTranslator) + * location - the location of the target (read-only; set with setLocation) * for import/export - this is an instance of nsILocalFile - * for web - this is a browser object - * translator - the translator currently in use (set by setTranslator) + * for web - this is a URL + * path - the path to the target; for web, this is the same as location * * PRIVATE PROPERTIES: * + * _numericTypes - possible numeric types as a comma-delimited string * _handlers - handlers for various events (see setHandler) * _configOptions - options set by translator modifying behavior of * Scholar.Translate @@ -43,43 +47,62 @@ * _waitForCompletion - whether to wait for asynchronous completion, or return * immediately when script has finished executing * _sandbox - sandbox in which translators will be executed + * _streams - streams that need to be closed when execution is complete + * + * WEB-ONLY PRIVATE PROPERTIES: + * + * _locationIsProxied - whether the URL being scraped is going through + * an EZProxy */ Scholar.Translate = function(type) { this.type = type; - if(this.type == "import") { - this.numericType = 1; - } else 
if(this.type == "export") { - this.numericType = 2; - } else if(this.type == "web") { - this.numericType = 3; + // import = 001 = 1 + // export = 010 = 2 + // web = 100 = 4 + + // combination types determined by addition or bitwise AND + // i.e., import+export = 1+2 = 3 + if(type == "import") { + this._numericTypes = "1,3,5,7"; + } else if(type == "export") { + this._numericTypes = "2,3,6,7"; + } else if(type == "web") { + this._numericTypes = "4,5,6,7"; + } else { + throw("invalid import type"); } this._handlers = new Array(); + this._streams = new Array(); } /* - * gets all applicable translators - * - * for import, you should call this after setFile; otherwise, you'll just get - * a list of all import filters, not filters equipped to handle a specific file + * sets the browser to be used for web translation; also sets the location */ -Scholar.Translate.prototype.getTranslators = function() { - this._generateSandbox(); - - if(this.type == "export") { - var sql = 'SELECT translatorID, label, target FROM translators WHERE type = ?'; - var translators = Scholar.DB.query(sql, [this.numericType]); - return translators; - } +Scholar.Translate.prototype.setBrowser = function(browser) { + this.browser = browser; + this.setLocation(browser.contentDocument.location.href); } /* - * sets the location to operate upon (file should be an nsILocalFile object) + * sets the location to operate upon (file should be an nsILocalFile object or + * web address) */ -Scholar.Translate.prototype.setLocation = function(file) { - this.location = file; +Scholar.Translate.prototype.setLocation = function(location) { + if(this.type == "web") { + // account for proxies + this.location = Scholar.Ingester.ProxyMonitor.proxyToProper(location); + if(this.location != location) { + // figure out if this URL is being proxies + this.locationIsProxied = true; + } + this.path = this.location; + } else { + this.location = location; + this.path = location.path; + } } /* @@ -88,23 +111,16 @@ 
Scholar.Translate.prototype.setLocation = function(file) { * accepts either the object from getTranslators() or an ID */ Scholar.Translate.prototype.setTranslator = function(translator) { - if(typeof(translator) == "object") { + if(typeof(translator) == "object") { // passed an object and not an ID translator = translator.translatorID; } - var sql = 'SELECT * FROM translators WHERE translatorID = ? AND type = ?'; - this.translator = Scholar.DB.rowQuery(sql, [translator, this.numericType]); + var sql = "SELECT * FROM translators WHERE translatorID = ? AND type IN ("+this._numericTypes+")"; + this.translator = Scholar.DB.rowQuery(sql, [translator]); if(!this.translator) { return false; } - if(this.type == "export") { - // for export, we need to execute the translator detectCode to get - // options; for other types, this has already been done - this._executeDetectCode(this.translator); - } - - Scholar.debug("got translator "+translator); return true; } @@ -130,7 +146,7 @@ Scholar.Translate.prototype.setTranslator = function(translator) { * done * valid: all * called: when all processing is finished - * passed: return value of the processing function + * passed: returns true if successful, false if an error occurred * returns: N/A */ Scholar.Translate.prototype.setHandler = function(type, handler) { @@ -141,6 +157,55 @@ Scholar.Translate.prototype.setHandler = function(type, handler) { } /* + * gets all applicable translators + * + * for import, you should call this after setFile; otherwise, you'll just get + * a list of all import filters, not filters equipped to handle a specific file + * + * this returns a list of translator objects, of which the following fields + * are useful: + * + * translatorID - the GUID of the translator + * label - the name of the translator + * itemType - the type of item this scraper says it will scrape + */ +Scholar.Translate.prototype.getTranslators = function() { + var sql = "SELECT translatorID, label, target, detectCode FROM 
translators WHERE type IN ("+this._numericTypes+") ORDER BY target IS NULL"; + var translators = Scholar.DB.query(sql); + + if(!this.location) { + return translators; // no need to see which can translate, because + // we don't have a location yet (for export or + // import dialog) + } else { + // create a new sandbox + this._generateSandbox(); + + var possibleTranslators = new Array(); + Scholar.debug("searching for translators for "+this.path); + + // see which translators can translate + for(var i in translators) { + if(this._canTranslate(translators[i])) { + Scholar.debug("found translator "+translators[i].label); + + // for some reason, and i'm not quite sure what this reason is, + // we HAVE to do this to get things to work right; we can't + // just push a normal translator object from an SQL statement + var translator = {translatorID:translators[i].translatorID, + label:translators[i].label, + target:translators[i].target, + itemType:translators[i].itemType} + + possibleTranslators.push(translator); + } + } + + return possibleTranslators; + } +} + +/* * gets translator options to be displayed in a dialog * * NOT IMPLEMENTED @@ -148,28 +213,57 @@ Scholar.Translate.prototype.setHandler = function(type, handler) { Scholar.Translate.prototype.displayOptions = function() { } -/* - * does the actual translation - */ -Scholar.Translate.prototype.translate = function() { - this._complete = false; - Scholar.debug("converting using "+this.translator.label); +Scholar.Translate.prototype._loadTranslator = function() { + if(!this._sandbox) { + // create a new sandbox if none exists + this._generateSandbox(); + } + + // parse detect code for the translator + this._parseDetectCode(this.translator); + + Scholar.debug("parsing code for "+this.translator.label); try { Components.utils.evalInSandbox(this.translator.code, this._sandbox); } catch(e) { Scholar.debug(e+' in parsing code for '+this.translator.label); this._translationComplete(false); - return; + return false; + } + 
+ return true; +} + +/* + * does the actual translation + */ +Scholar.Translate.prototype.translate = function() { + + if(!this.location) { + throw("cannot translate: no location specified"); } - if(this.type == "export") { - var returnValue = this._export(); + this._complete = false; + + if(!this._loadTranslator()) { + return; } - // If synchronous, call _translationComplete(); - if(!this._waitForCompletion) { - this._translationComplete(returnValue); + var returnValue; + if(this.type == "web") { + returnValue = this._web(); + } else if(this.type == "import") { + returnValue = this._import(); + } else if(this.type == "export") { + returnValue = this._export(); + } + if(!returnValue) { + // failure + this._translationComplete(false); + } else if(!this._waitForCompletion) { + // if synchronous, call _translationComplete(); + this._translationComplete(true); } } @@ -177,42 +271,145 @@ Scholar.Translate.prototype.translate = function() { * generates a sandbox for scraping/scraper detection */ Scholar.Translate.prototype._generateSandbox = function() { + var me = this; + if(this.type == "web") { - this._sandbox = new Components.utils.Sandbox(url); - this._sandbox.browser = this.browser; - this._sandbox.doc = this.browser.contentDocument; - this._sandbox.url = this.sandboxURL; - this._sandbox.utilities = new Scholar.Utilities.Ingester(this.window, this.proxiedURL, this.isHidden); - this._sandbox.utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.proxiedURL); - this._sandbox.model = this.model; + // use real URL, not proxied version, to create sandbox + this._sandbox = new Components.utils.Sandbox(this.browser.contentDocument.location.href); + this._sandbox.Scholar = new Object(); + + // add ingester utilities + this._sandbox.Scholar.Utilities = new Scholar.Utilities.Ingester(this.locationIsProxied); + this._sandbox.Scholar.Utilities.HTTPUtilities = new Scholar.Utilities.Ingester.HTTPUtilities(this.locationIsProxied); + + // set up selectItems 
handler + this._sandbox.Scholar.selectItems = function(options) { return me._selectItems(options) }; } else { + // use null URL to create sanbox this._sandbox = new Components.utils.Sandbox(""); - this._sandbox.utilities = new Scholar.Utilities(); + this._sandbox.Scholar = new Object(); + + this._sandbox.Scholar.Utilities = new Scholar.Utilities(); + } + + if(this.type == "web" || this.type == "import") { + // add routines to add new items + this._sandbox.Scholar.Item = Scholar.Translate.ScholarItem; + // attach the function to be run when an item is + this._sandbox.Scholar.Item.prototype.complete = function() {me._itemDone(this)}; + } else if(this.type == "export") { + // add routines to retrieve items and collections + this._sandbox.Scholar.nextItem = function() { return me._exportGetItem() }; + this._sandbox.Scholar.nextCollection = function() { return me._exportGetCollection() }; } this._sandbox.XPathResult = Components.interfaces.nsIDOMXPathResult; - this._sandbox.MARC_Record = Scholar.Ingester.MARC_Record; - this._sandbox.MARC_Record.prototype = new Scholar.Ingester.MARC_Record(); + // for asynchronous operation, use wait() + // done() is implemented after wait() is called + this._sandbox.Scholar.wait = function() { me._enableAsynchronous() }; + // for adding configuration options + this._sandbox.Scholar.configure = function(option, value) {me._configure(option, value) }; + // for adding displayed options + this._sandbox.Scholar.addOption = function(option, value) {me._addOption(option, value) }; + + // for loading other translators and accessing their methods var me = this; - this._sandbox.wait = function() {me._enableAsynchronous() }; - this._sandbox.configure = function(option, value) {me._configure(option, value) }; - this._sandbox.addOption = function(option, value) {me._addOption(option, value) }; + this._sandbox.Scholar.loadTranslator = function(type, translatorID) { + var translation = new Scholar.Translate(type); + // assign same handlers as for 
parent, because the done handler won't + // get called anyway, and the itemDone/selectItems handlers should be + // the same + translation._handlers = me._handlers; + // set the translator + translation.setTranslator(translatorID); + // load the translator into our sandbox + translation._loadTranslator(); + // use internal io + translation._initializeInternalIO(); + return translation._sandbox; + } } /* - * executes translator detectCode, sandboxed + * Check to see if _scraper_ can scrape this document */ -Scholar.Translate.prototype._executeDetectCode = function(translator) { +Scholar.Translate.prototype._canTranslate = function(translator) { + var canTranslate = false; + + // Test location with regular expression + // If this is slow, we could preload all scrapers and compile regular + // expressions, so each check will be faster + if(translator.target) { + if(this.type == "web") { + var regularExpression = new RegExp(translator.target, "i"); + } else { + var regularExpression = new RegExp("\."+translator.target+"$", "i"); + } + + if(regularExpression.test(this.path)) { + canTranslate = true; + } + } + + // Test with JavaScript if available and didn't have a regular expression or + // passed regular expression test + if((!translator.target || canTranslate) + && translator.detectCode) { + // parse the detect code and execute + this._parseDetectCode(translator); + + if(this.type == "import") { + try { + this._importConfigureIO(); // so it can read + } catch(e) { + Scholar.debug(e+' in opening IO for '+translator.label); + return false; + } + } + + if(this._sandbox.detect) { + var returnValue; + + try { + if(this.type == "web") { + returnValue = this._sandbox.detect(this.browser.contentDocument, this.location); + } else if(this.type == "import") { + returnValue = this._sandbox.detect(); + } + } catch(e) { + Scholar.debug(e+' in executing detectCode for '+translator.label); + return false; + } + + Scholar.debug("executed detectCode for "+translator.label); + + // 
detectCode returns text type + if(returnValue) { + canTranslate = true; + + if(typeof(returnValue) == "string") { + translator.itemType = returnValue; + } + } else { + canTranslate = false; + } + } + } + + return canTranslate; +} +Scholar.Translate.prototype._parseDetectCode = function(translator) { this._configOptions = new Array(); this._displayOptions = new Array(); - Scholar.debug("executing detect code"); - try { - return Components.utils.evalInSandbox(translator.detectCode, this._sandbox); - } catch(e) { - Scholar.debug(e+' in executing detectCode for '+translator.label); - return; + if(translator.detectCode) { + try { + Components.utils.evalInSandbox(translator.detectCode, this._sandbox); + } catch(e) { + Scholar.debug(e+' in parsing detectCode for '+translator.label); + return; + } } } @@ -259,8 +456,22 @@ Scholar.Translate.prototype._addOption = function(option, value) { * called as wait() in translator code */ Scholar.Translate.prototype._enableAsynchronous = function() { + me = this; this._waitForCompletion = true; - this._sandbox.done = function(returnValue) { me._translationComplete(returnValue); }; + this._sandbox.Scholar.done = function() { me._translationComplete(true) }; +} + +/* + * lets user pick which items s/he wants to put in his/her library + * + * called as selectItems() in translator code + */ +Scholar.Translate.prototype._selectItems = function(options) { + if(this._handlers.select) { + return this._runHandler("select", options); + } else { // no handler defined; assume they want all of them + return options; + } } /* @@ -278,17 +489,213 @@ Scholar.Translate.prototype._translationComplete = function(returnValue) { // call handler this._runHandler("done", returnValue); + + // close open streams + this._closeStreams(); } } /* + * closes open file streams, if any exist + */ +Scholar.Translate.prototype._closeStreams = function() { + if(this._streams.length) { + for(var i in this._streams) { + var stream = this._streams[i]; + + // stream could 
be either an input stream or an output stream + try { + stream.QueryInterface(Components.interfaces.nsIFileInputStream); + } catch(e) { + stream.QueryInterface(Components.interfaces.nsIFileOutputStream); + } + + // encase close in try block, because it's possible it's already + // closed + try { + stream.close(); + } catch(e) { + } + } + } + delete this._streams; + this._streams = new Array(); +} + +/* + * executed when an item is done and ready to be loaded into the database + */ +Scholar.Translate.prototype._itemDone = function(item) { + // Get typeID, defaulting to "website" + var type = (item.itemType ? item.itemType : "website"); + + // makes looping through easier + delete item.itemType, item.complete; + item.itemType = item.complete = undefined; + + var typeID = Scholar.ItemTypes.getID(type); + var newItem = Scholar.Items.getNewItemByType(typeID); + + if(item.date && !item.year) { + // date can serve as a year + var dateID = Scholar.ItemFields.getID("date"); + var yearID = Scholar.ItemFields.getID("year"); + if(!Scholar.ItemFields.isValidForType(dateID, typeID) && Scholar.ItemFields.isValidForType(yearID, typeID)) { + // year is valid but date is not + var yearRe = /[0-9]{4}/; + var m = yearRe.exec(item.date); + if(m) { + item.year = m[0] + item.date = undefined; + } + } + } else if(!item.date && item.year) { + // the converse is also true + var dateID = Scholar.ItemFields.getID("date"); + var yearID = Scholar.ItemFields.getID("year"); + if(Scholar.ItemFields.isValidForType(dateID, typeID) && !Scholar.ItemFields.isValidForType(yearID, typeID)) { + // date is valid but year is not + item.date = item.year; + item.year = undefined; + } + } + + Scholar.debug(item); + + var fieldID, field; + for(var i in item) { + // loop through item fields + data = item[i]; + + if(data) { // if field has content + if(i == "creators") { // creators are a special case + for(j in data) { + newItem.setCreator(j, data[j].firstName, data[j].lastName, 1); + } + } else if(i == "title") 
{ // skip checks for title + newItem.setField(i, data); + } else if(i == "tags") { // add tags + for(j in data) { + newItem.addTag(data[j]); + } + } else if(fieldID = Scholar.ItemFields.getID(i)) { + // if field is in db + if(Scholar.ItemFields.isValidForType(fieldID, typeID)) { + // if field is valid for this type + // add field + newItem.setField(i, data); + } else { + Scholar.debug("discarded field "+i+" for item: field not valid for type "+type); + } + } else { + Scholar.debug("discarded field "+i+" for item: field does not exist"); + } + } + } + + delete item; + + this._runHandler("itemDone", newItem); +} + +/* * calls a handler (see setHandler above) */ Scholar.Translate.prototype._runHandler = function(type, argument) { + var returnValue; if(this._handlers[type]) { for(var i in this._handlers[type]) { Scholar.debug("running handler "+i+" for "+type); - this._handlers[type][i](this, argument); + try { + returnValue = this._handlers[type][i](this, argument); + } catch(e) { + Scholar.debug(e+' in handler '+i+' for '+type); + } + } + } + return returnValue; +} + +/* + * does the actual web translation + */ +Scholar.Translate.prototype._web = function() { + try { + this._sandbox.doWeb(this.browser.contentDocument, this.location); + } catch(e) { + Scholar.debug(e+' in executing code for '+this.translator.label); + return false; + } + + return true; +} + +/* + * does the actual import translation + */ +Scholar.Translate.prototype._import = function() { + this._importConfigureIO(); + + try { + this._sandbox.doImport(); + } catch(e) { + Scholar.debug(e+' in executing code for '+this.translator.label); + return false; + } + + return true; +} + +/* + * sets up import for IO + */ +Scholar.Translate.prototype._importConfigureIO = function() { + if(this._configOptions.dataMode == "rdf") { + var IOService = Components.classes['@mozilla.org/network/io-service;1'] + .getService(Components.interfaces.nsIIOService); + var fileHandler = IOService.getProtocolHandler("file") + 
.QueryInterface(Components.interfaces.nsIFileProtocolHandler); + var URL = fileHandler.getURLSpecFromFile(this.location); + delete fileHandler, IOService; + + var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1'] + .getService(Components.interfaces.nsIRDFService); + var dataSource = RDFService.GetDataSourceBlocking(URL); + + // make an instance of the RDF handler + this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(dataSource); + } else { + // open file + var fStream = Components.classes["@mozilla.org/network/file-input-stream;1"] + .createInstance(Components.interfaces.nsIFileInputStream); + fStream.init(this.location, 0x01, 0664, 0); + this._streams.push(fStream); + + if(this._configOptions.dataMode == "line") { // line by line reading + var notEof = true; + var lineData = new Object(); + + fStream.QueryInterface(Components.interfaces.nsILineInputStream); + + this._sandbox.Scholar.read = function() { + if(notEof) { + notEof = fStream.readLine(lineData); + return lineData.value; + } else { + return false; + } + } + } else { // block reading + var sStream = Components.classes["@mozilla.org/scriptableinputstream;1"] + .createInstance(Components.interfaces.nsIScriptableInputStream); + sStream.init(fStream); + + this._sandbox.Scholar.read = function(amount) { + return sStream.read(amount); + } + + // attach sStream to stack of streams to close + this._streams.push(sStream); } } } @@ -300,36 +707,21 @@ Scholar.Translate.prototype._export = function() { this._exportConfigureIO(); // get items - var itemObjects = Scholar.getItems(); - var itemArrays = new Array(); - for(var i in itemObjects) { - itemArrays.push(itemObjects[i].toArray()); - } - delete itemObjects; // free memory + this._itemsLeft = Scholar.getItems(); // get collections, if requested - var collectionArrays; if(this._configOptions.getCollections) { - var collectionObjects = Scholar.getCollections(); - collectionArrays = new Array(); - for(var i in collectionObjects) { - var collection = 
new Object(); - collection.id = collectionObjects[i].getID(); - collection.name = collectionObjects[i].getName(); - collection.type = "collection"; - collection.children = collectionObjects[i].toArray(); - - collectionArrays.push(collection); - } - delete collectionObjects; // free memory + this._collectionsLeft = Scholar.getCollections(); } try { - return this._sandbox.translate(itemArrays, collectionArrays); + this._sandbox.doExport(); } catch(e) { Scholar.debug(e+' in executing code for '+this.translator.label); - this._translationComplete(false); + return false; } + + return true; } /* @@ -337,17 +729,13 @@ Scholar.Translate.prototype._export = function() { */ Scholar.Translate.prototype._exportConfigureIO = function() { // open file - var foStream = Components.classes["@mozilla.org/network/file-output-stream;1"] + var fStream = Components.classes["@mozilla.org/network/file-output-stream;1"] .createInstance(Components.interfaces.nsIFileOutputStream); - foStream.init(this.location, 0x02 | 0x08 | 0x20, 0664, 0); // write, create, truncate + fStream.init(this.location, 0x02 | 0x08 | 0x20, 0664, 0); // write, create, truncate + // attach to stack of streams to close at the end + this._streams.push(fStream); - if(this._configOptions.dataMode == "rdf") { - /*** INITIALIZATION ***/ - var RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1'].getService(Components.interfaces.nsIRDFService); - var IOService = Components.classes['@mozilla.org/network/io-service;1'].getService(Components.interfaces.nsIIOService); - var AtomService = Components.classes["@mozilla.org/atom-service;1"].getService(Components.interfaces.nsIAtomService); - var RDFContainerUtils = Components.classes["@mozilla.org/rdf/container-utils;1"].getService(Components.interfaces.nsIRDFContainerUtils); - + if(this._configOptions.dataMode == "rdf") { // rdf io // create data source var dataSource = Components.classes["@mozilla.org/rdf/datasource;1?name=xml-datasource"]. 
createInstance(Components.interfaces.nsIRDFDataSource); @@ -355,91 +743,319 @@ Scholar.Translate.prototype._exportConfigureIO = function() { var serializer = Components.classes["@mozilla.org/rdf/xml-serializer;1"]. createInstance(Components.interfaces.nsIRDFXMLSerializer); serializer.init(dataSource); + serializer.QueryInterface(Components.interfaces.nsIRDFXMLSource); + + // make an instance of the RDF handler + this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(dataSource, serializer); - /*** FUNCTIONS ***/ - this._sandbox.model = new Object(); + this.setHandler("done", function() { serializer.Serialize(fStream) }); + } else { // regular io; write just writes to file + this._sandbox.Scholar.write = function(data) { fStream.write(data, data.length) }; + } +} + +/* + * gets the next item to process (called as Scholar.nextItem() from code) + */ +Scholar.Translate.prototype._exportGetItem = function() { + if(this._itemsLeft.length != 0) { + var returnItem = this._itemsLeft.shift(); + return returnItem.toArray(); + } + return false; +} + +/* + * gets the next item to collection (called as Scholar.nextCollection() from code) + */ +Scholar.Translate.prototype._exportGetCollection = function() { + if(!this._collectionsLeft) { + throw("getCollections configure option not set; cannot retrieve collection"); + } + + if(this._collectionsLeft.length != 0) { + var returnItem = this._collectionsLeft.shift(); + var collection = new Object(); + collection.id = returnItem.getID(); + collection.name = returnItem.getName(); + collection.type = "collection"; + collection.children = returnItem.toArray(); - // writes an RDF triple - this._sandbox.model.addStatement = function(about, relation, value, literal) { - Scholar.debug("pre: model.addStatement("+about+", "+relation+", "+value+", "+literal+")"); + return returnItem; + } +} + +/* + * sets up internal IO in such a way that both reading and writing are possible + * (for inter-scraper communications) + */ 
+Scholar.Translate.prototype._initializeInternalIO = function() { + if(this.type == "import" || this.type == "export") { + if(this._configOptions.dataMode == "rdf") { + // use an in-memory data source for internal IO + var dataSource = Components.classes["@mozilla.org/rdf/datasource;1?name=in-memory-datasource"]. + createInstance(Components.interfaces.nsIRDFDataSource); - if(!(about instanceof Components.interfaces.nsIRDFResource)) { - about = RDFService.GetResource(about); - } - if(!(value instanceof Components.interfaces.nsIRDFResource)) { - if(literal) { - value = RDFService.GetLiteral(value); - } else { - value = RDFService.GetResource(value); - } - } + // make an instance of the RDF handler + this._sandbox.Scholar.RDF = new Scholar.Translate.RDF(dataSource); + } else { + // create a storage stream + var storageStream = Components.classes["@mozilla.org/storagestream;1"]. + createInstance(Components.interfaces.nsIStorageStream); + storageStream.init(4096, 4294967295, null); // virtually no size limit - Scholar.debug("post: model.addStatement("+about+", "+relation+", "+value+", "+literal+")"); + // set up write() method + var fStream = storageStream.getOutputStream(0); + this._sandbox.Scholar.write = function(data) { fStream.write(data, data.length) }; - dataSource.Assert(about, RDFService.GetResource(relation), value, true); - } - - // creates an anonymous resource - this._sandbox.model.newResource = function() { return RDFService.GetAnonymousResource() }; - - // creates a new container - this._sandbox.model.newContainer = function(type, about) { - if(!(about instanceof Components.interfaces.nsIRDFResource)) { - about = RDFService.GetResource(about); - } - - type = type.toLowerCase(); - if(type == "bag") { - return RDFContainerUtils.MakeBag(dataSource, about); - } else if(type == "seq") { - return RDFContainerUtils.MakeSeq(dataSource, about); - } else if(type == "alt") { - return RDFContainerUtils.MakeAlt(dataSource, about); - } else { - throw "Invalid container 
type in model.newContainer"; - } - } - - // adds a new container (index optional) - this._sandbox.model.addContainerElement = function(about, element, index) { - if(!(about instanceof Components.interfaces.nsIRDFContainer)) { - if(!(about instanceof Components.interfaces.nsIRDFResource)) { - about = RDFService.GetResource(about); + // set up read methods + var sStream; + var me = this; + if(this._configOptions.dataMode == "line") { // line by line reading + var lastCharacter; + + this._sandbox.Scholar.read = function() { + if(!sStream) { // allocate an fStream and sStream on the fly + // otherwise with no data we get an error + sStream = Components.classes["@mozilla.org/scriptableinputstream;1"] + .createInstance(Components.interfaces.nsIScriptableInputStream); + sStream.init(fStream.newInputStream(0)); + + // attach sStream to stack of streams to close + me._streams.push(sStream); + } + + var character = sStream.read(1); + if(!character) { + return false; + } + var string = ""; + + if(lastCharacter == "\r" && character == "\n") { + // if the last read got a cr, and this first char was + // an lf, ignore the lf + character = ""; + } + + while(character != "\n" && character != "\r" && character) { + string += character; + character = sStream.read(1); + } + + lastCharacter = character; + + return string; + } + } else { // block reading + this._sandbox.Scholar.read = function(amount) { + if(!sStream) { // allocate an fStream and sStream on the fly + // otherwise with no data we get an error + sStream = Components.classes["@mozilla.org/scriptableinputstream;1"] + .createInstance(Components.interfaces.nsIScriptableInputStream); + sStream.init(fStream.newInputStream(0)); + + // attach sStream to stack of streams to close + me._streams.push(sStream); + } + + return sStream.read(amount); } - var container = Components.classes["@mozilla.org/rdf/container;1"]. 
- createInstance(Components.interfaces.nsIRDFContainer); - container.Init(dataSource, about); - } - if(!(element instanceof Components.interfaces.nsIRDFResource)) { - element = RDFService.GetResource(element); } - if(index) { - about.InsertElementAt(element, index, true); - } else { - about.AppendElement(element); + // set Scholar.eof() to close the storage stream + this._sandbox.Scholar.eof = function() { + storageStream.QueryInterface(Components.interfaces.nsIOutputStream); + storageStream.close(); } } - - // sets a namespace - this._sandbox.model.addNamespace = function(prefix, uri) { - serializer.addNameSpace(AtomService.getAtom(prefix), uri); + } +} + +/* Scholar.Translate.ScholarItem: a class for generating new item from + * inside scraper code + * + * (this must be part of the prototype because it must be able to access + * methods relating to a specific instance of Scholar.Translate yet be called + * as a class) + */ + +Scholar.Translate.ScholarItem = function(itemType) { + // assign item type + this.itemType = itemType; + // generate creators array + this.creators = new Array(); + // generate notes array + this.notes = new Array(); + // generate tags array + this.tags = new Array(); +} + +/* Scholar.Translate.RDF: a class for handling RDF IO + * + * If an import/export translator specifies dataMode RDF, this is the interface, + * accessible from model.x + * + * In order to simplify things, all classes take in their resource/container + * as either the Mozilla native type or a string, but all + * return resource/containers as Mozilla native types (use model.toString to + * convert) + */ + +Scholar.Translate.RDF = function(dataSource, serializer) { + this._RDFService = Components.classes['@mozilla.org/rdf/rdf-service;1'] + .getService(Components.interfaces.nsIRDFService); + this._AtomService = Components.classes["@mozilla.org/atom-service;1"] + .getService(Components.interfaces.nsIAtomService); + this._RDFContainerUtils = 
Components.classes["@mozilla.org/rdf/container-utils;1"] + .getService(Components.interfaces.nsIRDFContainerUtils); + + this._dataSource = dataSource; + this._serializer = serializer; +} + +// turn an nsISimpleEnumerator into an array +Scholar.Translate.RDF.prototype._deEnumerate = function(enumerator) { + if(!(enumerator instanceof Components.interfaces.nsISimpleEnumerator)) { + return false; + } + + var resources = new Array(); + + while(enumerator.hasMoreElements()) { + var resource = enumerator.getNext(); + try { + resource.QueryInterface(Components.interfaces.nsIRDFLiteral); + resources.push(resource.Value); + } catch(e) { + resource.QueryInterface(Components.interfaces.nsIRDFResource); + resources.push(resource); + } + } + + if(resources.length) { + return resources; + } else { + return false; + } +} + +// get a resource as an nsIRDFResource, instead of a string +Scholar.Translate.RDF.prototype._getResource = function(about) { + if(!(about instanceof Components.interfaces.nsIRDFResource)) { + about = this._RDFService.GetResource(about); + } + return about; +} + +// USED FOR OUTPUT + +// writes an RDF triple +Scholar.Translate.RDF.prototype.addStatement = function(about, relation, value, literal) { + about = this._getResource(about); + + if(!(value instanceof Components.interfaces.nsIRDFResource)) { + if(literal) { + value = this._RDFService.GetLiteral(value); + } else { + value = this._RDFService.GetResource(value); } + } + + this._dataSource.Assert(about, this._RDFService.GetResource(relation), value, true); +} - this.setHandler("done", function() { - serializer.QueryInterface(Components.interfaces.nsIRDFXMLSource); - serializer.Serialize(foStream); - - delete dataSource, RDFService, IOService, AtomService, RDFContainerUtils; - }); +// creates an anonymous resource +Scholar.Translate.RDF.prototype.newResource = function() { + return this._RDFService.GetAnonymousResource() +}; + +// creates a new container +Scholar.Translate.RDF.prototype.newContainer = 
function(type, about) { + about = this._getResource(about); + + type = type.toLowerCase(); + if(type == "bag") { + return this._RDFContainerUtils.MakeBag(this._dataSource, about); + } else if(type == "seq") { + return this._RDFContainerUtils.MakeSeq(this._dataSource, about); + } else if(type == "alt") { + return this._RDFContainerUtils.MakeAlt(this._dataSource, about); } else { - /*** FUNCTIONS ***/ - // write just writes to the file - this._sandbox.write = function(data) { foStream.write(data, data.length) }; + throw "Invalid container type in model.newContainer"; } +} - this.setHandler("done", function() { - foStream.close(); - delete foStream; - }); +// adds a new container element (index optional) +Scholar.Translate.RDF.prototype.addContainerElement = function(about, element, index) { + if(!(about instanceof Components.interfaces.nsIRDFContainer)) { + about = this._getResource(about); + var container = Components.classes["@mozilla.org/rdf/container;1"]. + createInstance(Components.interfaces.nsIRDFContainer); + container.Init(this._dataSource, about); + } + if(!(element instanceof Components.interfaces.nsIRDFResource)) { + element = this._RDFService.GetResource(element); + } + + if(index) { + about.InsertElementAt(element, index, true); + } else { + about.AppendElement(element); + } +} + +// sets a namespace +Scholar.Translate.RDF.prototype.addNamespace = function(prefix, uri) { + if(this._serializer) { // silently fail, in case the reason the scraper + // is failing is that we're using internal IO + this._serializer.addNameSpace(this._AtomService.getAtom(prefix), uri); + } +} + +// gets a resource's URI +Scholar.Translate.RDF.prototype.getResourceURI = function(resource) { + resource.QueryInterface(Components.interfaces.nsIRDFResource); + return resource.ValueUTF8; +} + +// USED FOR INPUT + +// gets all RDF resources +Scholar.Translate.RDF.prototype.getAllResources = function() { + var resourceEnumerator = this._dataSource.GetAllResources(); + return 
this._deEnumerate(resourceEnumerator); +} + +// gets arcs going in +Scholar.Translate.RDF.prototype.getArcsIn = function(resource) { + resource = this._getResource(resource); + + var arcEnumerator = this._dataSource.ArcLabelsIn(resource); + return this._deEnumerate(arcEnumerator); +} + +// gets arcs going out +Scholar.Translate.RDF.prototype.getArcsOut = function(resource) { + resource = this._getResource(resource); + + var arcEnumerator = this._dataSource.ArcLabelsOut(resource); + return this._deEnumerate(arcEnumerator); +} + +// gets source resources +Scholar.Translate.RDF.prototype.getSources = function(resource, property) { + property = this._getResource(property); + resource = this._getResource(resource); + + var enumerator = this._dataSource.GetSources(resource, property, true); + return this._deEnumerate(enumerator); +} + +// gets target resources +Scholar.Translate.RDF.prototype.getTargets = function(resource, property) { + property = this._getResource(property); + resource = this._getResource(resource); + + var enumerator = this._dataSource.GetTargets(resource, property, true); + return this._deEnumerate(enumerator); } \ No newline at end of file diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -82,19 +82,29 @@ Scholar.Utilities.prototype.dateToISO = function(jsDate) { /* * Cleans extraneous punctuation off an author name */ -Scholar.Utilities.prototype.cleanAuthor = function(author) { +Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) { author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); author = author.replace(/[\s\,\/\[\]\:\.]+$/, ''); author = author.replace(/ +/, ' '); - // Add period for initials - if(author.substring(author.length-2, author.length-1) == " ") { - author += "."; - } - var splitNames = author.split(', '); - if(splitNames.length > 1) { - author = splitNames[1]+' '+splitNames[0]; + if(useComma) { + // Add period for initials + 
if(author.substr(author.length-2, 1) == " ") { + author += "."; + } + var splitNames = author.split(', '); + if(splitNames.length > 1) { + var lastName = splitNames[0]; + var firstName = splitNames[1]; + } else { + var lastName = author; + } + } else { + var spaceIndex = author.lastIndexOf(" "); + var lastName = author.substring(spaceIndex+1); + var firstName = author.substring(0, spaceIndex); } - return author; + // TODO: take type into account + return {firstName:firstName, lastName:lastName, creatorType:type}; } /* @@ -141,7 +151,7 @@ Scholar.Utilities.prototype.getVersion = function() { /* * Get a page range, given a user-entered set of pages */ -Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/ +Scholar.Utilities.prototype._pageRangeRegexp = /^\s*([0-9]+)-([0-9]+)\s*$/; Scholar.Utilities.prototype.getPageRange = function(pages) { var pageNumbers; var m = this._pageRangeRegexp.exec(pages); @@ -155,9 +165,22 @@ Scholar.Utilities.prototype.getPageRange = function(pages) { return pageNumbers; } +/* + * provide inArray function + */ Scholar.Utilities.prototype.inArray = Scholar.inArray; /* + * pads a number or other string with a given string on the left + */ +Scholar.Utilities.prototype.lpad = function(string, pad, length) { + while(string.length < length) { + string = pad + string; + } + return string; +} + +/* * END SCHOLAR FOR FIREFOX EXTENSIONS */ @@ -169,10 +192,8 @@ Scholar.Utilities.prototype.inArray = Scholar.inArray; // Scholar.Utilities.Ingester extends Scholar.Utilities, offering additional // classes relating to data extraction specifically from HTML documents. 
-Scholar.Utilities.Ingester = function(myWindow, proxiedURL, isHidden) { - this.window = myWindow; +Scholar.Utilities.Ingester = function(proxiedURL) { this.proxiedURL = proxiedURL; - this.isHidden = isHidden; } Scholar.Utilities.Ingester.prototype = new Scholar.Utilities(); @@ -241,21 +262,6 @@ Scholar.Utilities.Ingester.prototype.getNodeString = function(doc, contextNode, } /* - * Allows a user to select which items to scrape - */ -Scholar.Utilities.Ingester.prototype.selectItems = function(itemList) { - if(this.isHidden != true) { - // this is kinda ugly, mozillazine made me do it! honest! - var io = { dataIn:itemList, dataOut:null } - var newDialog = this.window.openDialog("chrome://scholar/content/ingester/selectitems.xul", - "_blank","chrome,modal,centerscreen,resizable=yes", io); - return io.dataOut; - } else { - return null; - } -} - -/* * Grabs items based on URLs */ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, rejectRe) { @@ -300,129 +306,19 @@ Scholar.Utilities.Ingester.prototype.getItemArray = function(doc, inHere, urlRe, return availableItems; } -// These functions are for use by importMARCRecord. 
They're private, because, -// while they are useful, it's also nice if as many of our scrapers as possible -// are PiggyBank compatible, and if our scrapers used functions, that would -// break compatibility -Scholar.Utilities.Ingester.prototype._MARCCleanString = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); - return author.replace(/ +/, ' '); -} - -Scholar.Utilities.Ingester.prototype._MARCCleanNumber = function(author) { - author = author.replace(/^[\s\.\,\/\[\]\:]+/, ''); - author = author.replace(/[\s\.\,\/\[\]\:]+$/, ''); - var regexp = /^[^ ]*/; - var m = regexp.exec(author); - if(m) { - return m[0]; - } -} -Scholar.Utilities.Ingester.prototype._MARCPullYear = function(text) { - var pullRe = /[0-9]+/; - var m = pullRe.exec(text); - if(m) { - return m[0]; - } -} - -Scholar.Utilities.Ingester.prototype._MARCAssociateField = function(record, uri, model, fieldNo, rdfUri, execMe, prefix, part) { - if(!part) { - part = 'a'; - } - var field = record.get_field_subfields(fieldNo); - Scholar.debug('Found '+field.length+' matches for '+fieldNo+part); - if(field) { - for(i in field) { - var value; - for(var j=0; j<part.length; j++) { - var myPart = part.substr(j, 1); - if(field[i][myPart]) { - if(value) { - value += " "+field[i][myPart]; - } else { - value = field[i][myPart]; - } - } - } - if(value) { - if(execMe) { - value = execMe(value); - } - if(prefix) { - value = prefix + value; - } - model.addStatement(uri, rdfUri, value); - } - } - } - return model; -} - -// This is an extension to PiggyBank's architecture. 
It's here so that we don't -// need an enormous library for each scraper that wants to use MARC records -Scholar.Utilities.Ingester.prototype.importMARCRecord = function(record, uri, model) { - var prefixDC = 'http://purl.org/dc/elements/1.1/'; - var prefixDCMI = 'http://purl.org/dc/dcmitype/'; - var prefixDummy = 'http://chnm.gmu.edu/firefox-scholar/'; - var prefixRDF = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'; - - // Extract ISBNs - model = this._MARCAssociateField(record, uri, model, '020', prefixDC + 'identifier', this._MARCCleanNumber, 'ISBN '); - // Extract ISSNs - model = this._MARCAssociateField(record, uri, model, '022', prefixDC + 'identifier', this._MARCCleanNumber, 'ISSN '); - // Extract creators - model = this._MARCAssociateField(record, uri, model, '100', prefixDC + 'creator', this.cleanAuthor); - model = this._MARCAssociateField(record, uri, model, '110', prefixDummy + 'corporateCreator', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '111', prefixDummy + 'corporateCreator', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '700', prefixDC + 'contributor', this.cleanAuthor); - model = this._MARCAssociateField(record, uri, model, '710', prefixDummy + 'corporateContributor', this._MARCCleanString); - model = this._MARCAssociateField(record, uri, model, '711', prefixDummy + 'corporateContributor', this._MARCCleanString); - if(!model.data[uri] || (!model.data[uri][prefixDC + 'creator'] && !model.data[uri][prefixDC + 'contributor'] && !model.data[uri][prefixDummy + 'corporateCreator'] && !model.data[uri][prefixDummy + 'corporateContributor'])) { - // some LOC entries have no listed author, but have the author in the person subject field as the first entry - var field = record.get_field_subfields('600'); - if(field[0]) { - model.addStatement(uri, prefixDC + 'creator', this.cleanAuthor(field[0]['a'])); - } - } - // Extract title - model = this._MARCAssociateField(record, uri, model, '245', 
prefixDC + 'title', this._MARCCleanString, '', 'ab'); - // Extract edition - model = this._MARCAssociateField(record, uri, model, '250', prefixDC + 'hasVersion', this._MARCCleanString); - // Extract place info - model = this._MARCAssociateField(record, uri, model, '260', prefixDummy + 'place', this._MARCCleanString, '', 'a'); - // Extract publisher info - model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'publisher', this._MARCCleanString, '', 'b'); - // Extract year - model = this._MARCAssociateField(record, uri, model, '260', prefixDC + 'year', this._MARCPullYear, '', 'c'); - // Extract series - model = this._MARCAssociateField(record, uri, model, '440', prefixDummy + 'series', this._MARCCleanString); - // Extract call number - model = this._MARCAssociateField(record, uri, model, '050', prefixDC + 'identifier', this._MARCCleanString, 'LCC ', 'ab'); - model = this._MARCAssociateField(record, uri, model, '060', prefixDC + 'identifier', this._MARCCleanString, 'NLM ', 'ab'); - model = this._MARCAssociateField(record, uri, model, '070', prefixDC + 'identifier', this._MARCCleanString, 'NAL ', 'ab'); - model = this._MARCAssociateField(record, uri, model, '080', prefixDC + 'identifier', this._MARCCleanString, 'UDC ', 'ab'); - model = this._MARCAssociateField(record, uri, model, '082', prefixDC + 'identifier', this._MARCCleanString, 'DDC ', 'a'); - model = this._MARCAssociateField(record, uri, model, '084', prefixDC + 'identifier', this._MARCCleanString, 'CN ', 'ab'); - - // Set type - model = model.addStatement(uri, prefixRDF + 'type', prefixDummy + "book", true); -} - /* * END SCHOLAR FOR FIREFOX EXTENSIONS */ // Ingester adapters for Scholar.Utilities.HTTP to handle proxies -Scholar.Utilities.Ingester.prototype.loadDocument = function(url, browser, succeeded, failed) { +Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) { if(this.proxiedURL) { url = Scholar.Ingester.ProxyMonitor.properToProxy(url); } 
Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed); } -Scholar.Utilities.Ingester.prototype.processDocuments = function(browser, firstDoc, urls, processor, done, exception) { +Scholar.Utilities.Ingester.prototype.processDocuments = function(firstDoc, urls, processor, done, exception) { for(i in urls) { urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]); } @@ -476,6 +372,7 @@ Scholar.Utilities.HTTP = new function() { * in our code, is required for compatiblity with the Piggy Bank project **/ function doGet(url, callback1, callback2) { + Scholar.debug("HTTP GET "+url); if (this.browserIsOffline()){ return false; } @@ -508,6 +405,7 @@ Scholar.Utilities.HTTP = new function() { * in our code, is required for compatiblity with the Piggy Bank project **/ function doPost(url, body, callback1, callback2) { + Scholar.debug("HTTP POST "+body+" to "+url); if (this.browserIsOffline()){ return false; } @@ -538,6 +436,7 @@ Scholar.Utilities.HTTP = new function() { * in our code, is required for compatiblity with the Piggy Bank project **/ function doOptions(url, body, callback1, callback2) { + Scholar.debug("HTTP OPTIONS "+url); if (this.browserIsOffline()){ return false; } @@ -641,7 +540,6 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do .hiddenDOMWindow; var hiddenBrowser = Scholar.Ingester.createHiddenBrowser(myWindow); var prevUrl, url; - Scholar.debug("processDocuments called"); try { if (urls.length == 0) { @@ -690,14 +588,11 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do } }; var init = function() { - Scholar.debug("init called"); hiddenBrowser.addEventListener("load", onLoad, true); if (firstDoc) { - Scholar.debug("processing"); processor(firstDoc, doLoad); } else { - Scholar.debug("doing load"); doLoad(); } } diff --git a/components/chnmIScholarService.js b/components/chnmIScholarService.js @@ -45,10 +45,6 @@ 
Cc["@mozilla.org/moz/jssubscript-loader;1"] Cc["@mozilla.org/moz/jssubscript-loader;1"] .getService(Ci.mozIJSSubScriptLoader) .loadSubScript("chrome://scholar/content/xpcom/translate.js"); - -Cc["@mozilla.org/moz/jssubscript-loader;1"] - .getService(Ci.mozIJSSubScriptLoader) - .loadSubScript("chrome://scholar/content/xpcom/marc.js"); Cc["@mozilla.org/moz/jssubscript-loader;1"] .getService(Ci.mozIJSSubScriptLoader) diff --git a/scrapers.sql b/scrapers.sql @@ -1,487 +1,445 @@ --- 33 +-- 34 -- Set the following timestamp to the most recent scraper update date REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-07-07 12:44:00')); -REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 3, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', -'if(doc.location.href.indexOf("search") >= 0) { - return "multiple"; -} else { - return "book"; +REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', +'function detect(doc, url) { + var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)''); + if(searchRe.test(doc.location.href)) { + return "multiple"; + } else { + return "book"; + } } ', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; +'function scrape(doc) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; -function scrape(doc) { - uri = doc.location.href; + var newItem = new Scholar.Item("book"); + newItem.source = doc.location.href; // Retrieve authors - var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; - - model.addStatement(uri, prefixDC + ''creator'', utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue), false); // Use your own type here - } + try { + var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/a''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + for (var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + var author = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue; + + newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author")); + } + } catch(ex) {} // Retrieve data from "Product Details" box var xpath = ''/html/body/table/tbody/tr/td[2]/table/tbody/tr/td[@class="bucket"]/div[@class="content"]/ul/li''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; - var attribute = utilities.cleanString(utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); - if(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { - var value = utilities.cleanString(utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); - if(attribute == "Publisher:") { - if(value.lastIndexOf("(") != -1) { - var date = value.substring(value.lastIndexOf("(")+1, value.length-1); - jsDate = new Date(date); - if(!isNaN(jsDate.valueOf())) { - date = utilities.dateToISO(jsDate); + try { + var elmt = 
elmts[i]; + var attribute = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); + if(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { + var value = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); + if(attribute == "Publisher:") { + if(value.lastIndexOf("(") != -1) { + var date = value.substring(value.lastIndexOf("(")+1, value.length-1); + jsDate = new Date(date); + if(!isNaN(jsDate.valueOf())) { + date = Scholar.Utilities.dateToISO(jsDate); + } + newItem.date = date; + + value = value.substring(0, value.lastIndexOf("(")-1); } - - value = value.substring(0, value.lastIndexOf("(")-1); - } - if(value.lastIndexOf(";") != -1) { - var edition = value.substring(value.lastIndexOf(";")+2, value.length); - value = value.substring(0, value.lastIndexOf(";")); + if(value.lastIndexOf(";") != -1) { + newItem.edition = value.substring(value.lastIndexOf(";")+2, value.length); + + value = value.substring(0, value.lastIndexOf(";")); + } + newItem.publisher = value; + /*} else if(attribute == "Language:") { + .addStatement(uri, prefixDC + ''language'', value);*/ + } else if(attribute == "ISBN:") { + newItem.ISBN = value; + /*} else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { + .addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" "))); + .addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":")));*/ } - model.addStatement(uri, prefixDC + ''publisher'', value); - model.addStatement(uri, prefixDC + ''date'', date); - model.addStatement(uri, prefixDC + ''hasVersion'', edition); - } else if(attribute == "Language:") { - model.addStatement(uri, prefixDC + ''language'', value); - } else if(attribute == "ISBN:") { - model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value); - } else if(value.substring(value.indexOf(" ")+1, value.length) == "pages") { - 
model.addStatement(uri, prefixDummy + ''pages'', value.substring(0, value.indexOf(" "))); - model.addStatement(uri, prefixDC + ''medium'', attribute.substring(0, attribute.indexOf(":"))); } - } + } catch(ex) {} } var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - var title = utilities.cleanString(utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var title = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { title = title.substring(0, title.lastIndexOf("(")-1); } - model.addStatement(uri, prefixDC + ''title'', title); - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + newItem.title = title; + + newItem.complete(); } -var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/)''); -var m = searchRe.exec(doc.location.href) -if(m) { - // Why can''t amazon use the same stylesheets - var xpath; - if(m == "gp/search/") { - xpath = ''//table[@class="searchresults"]''; +function doWeb(doc, url) { + var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)''); + var m = searchRe.exec(doc.location.href) + if(m) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + // Why can''t amazon use the same stylesheets + var xpath; + if(m == "exec/obidos/search-handle-url/") { + xpath = ''//table[@cellpadding="3"]''; + } else { + xpath = ''//table[@class="searchresults"]''; + } + + var searchresults = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var items = Scholar.Utilities.getItemArray(doc, searchresults, ''^http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { Scholar.done(); }, function() {}); + + Scholar.wait(); } else { - xpath = ''//table[@cellpadding="3"]''; - } - - var searchresults = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - var items = utilities.getItemArray(doc, searchresults, ''^http://www\.amazon\.com/(gp/product/|exec/obidos/tg/detail/)'', ''^(Buy new|Hardcover|Paperback|Digital)$''); - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - var uris = new Array(); - for(i in items) { - uris.push(i); + scrape(doc); } - - utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, - function() { done(); }, function() {}); - - wait(); -} else { - scrape(doc); }'); -REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 3, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', -'if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { - return "book"; -} else if(doc.title == ''FirstSearch: WorldCat List of Records'') { - return "multiple"; -} -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; 
-var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/; -var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/; -var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/; -var hostRegexp = new RegExp("http://([^/]+)/"); - -var uri = doc.location.href; - -var sMatch = sessionRegexp.exec(uri); -var sessionid = sMatch[1]; - -var hMatch = hostRegexp.exec(uri); -var host = hMatch[1]; - -var newUri, exportselect; - -if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { - var publisherRegexp = /^(.*), (.*?),?$/; - - var nMatch = numberRegexp.exec(uri); - if(nMatch) { - var number = nMatch[1]; - } else { - number = 1; - } - - var rMatch = resultsetRegexp.exec(uri); - if(rMatch) { - var resultset = rMatch[1]; - } else { - // It''s in an XPCNativeWrapper, so we have to do this black magic - resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value; +REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', +'function detect(doc, url) { + if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { + return "book"; + } else if(doc.title == ''FirstSearch: WorldCat List of Records'') { + return "multiple"; } +}', +'function doWeb(doc, url) { + var sessionRegexp = /(?:\?|\:)sessionid=([^?:]+)(?:\?|\:|$)/; + var numberRegexp = /(?:\?|\:)recno=([^?:]+)(?:\?|\:|$)/; + var resultsetRegexp = /(?:\?|\:)resultset=([^?:]+)(?:\?|\:|$)/; + var hostRegexp = new RegExp("http://([^/]+)/"); + + var sMatch = sessionRegexp.exec(url); + var sessionid = sMatch[1]; - exportselect = ''record''; - newUri = ''http://''+host+''/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' 
+ number + '':sessionid='' + sessionid + '':entitypagenum=35:0''; - - var uris = new Array(newUri); -} else { - var items = utilities.getItemArray(doc, doc, ''/WebZ/FSFETCH\\?fetchtype=fullrecord'', ''^(See more details for locating this item|Detailed Record)$''); - items = utilities.selectItems(items); - - if(!items) { - return true; - } + var hMatch = hostRegexp.exec(url); + var host = hMatch[1]; - // Set BookMark cookie - for(i in items) { // Hack to get first item - var myCookie = sessionid+":"; - var rMatch = resultsetRegexp.exec(i); - var resultset = rMatch[1]; - break; - } - var uris = new Array(); - for(i in items) { - var nMatch = numberRegexp.exec(i); - myCookie += resultset+"_"+nMatch[1]+","; - uris.push(i); - } - myCookie = myCookie.substr(0, myCookie.length-1); - doc.cookie = "BookMark="+myCookie; + var newUri, exportselect; - exportselect = ''marked''; - newUri = ''http://''+host+''/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno=1:sessionid='' + sessionid + '':entitypagenum=29:0''; -} - -utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) { - var lineRegexp = new RegExp(); - lineRegexp.compile("^([\\w() ]+): *(.*)$"); - - var k = 0; - var uri = uris[k]; - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); - - var lines = text.split(''\n''); - for(var i=0;i<lines.length;i++) { - match = lineRegexp.exec(lines[i]); - if(lines[i] == "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------") { - k++; - if(uris[k]) { - uri = uris[k]; - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); - } else { - break; - } - } else if(match) { - if(match[1] == ''Title'') { - var title = match[2]; - if(!lineRegexp.test(lines[i+1])) { 
- i++; - title += '' ''+lines[i]; - } - if(title.substring(title.length-2) == " /") { - title = title.substring(0, title.length-2); + if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { + var publisherRegexp = /^(.*), (.*?),?$/; + + var nMatch = numberRegexp.exec(url); + if(nMatch) { + var number = nMatch[1]; + } else { + number = 1; + } + + var rMatch = resultsetRegexp.exec(url); + if(rMatch) { + var resultset = rMatch[1]; + } else { + // It''s in an XPCNativeWrapper, so we have to do this black magic + resultset = doc.forms.namedItem(''main'').elements.namedItem(''resultset'').value; + } + + exportselect = ''record''; + newUri = ''http://''+host+''/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno='' + number + '':sessionid='' + sessionid + '':entitypagenum=35:0''; + + var uris = new Array(newUri); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, ''/WebZ/FSFETCH\\?fetchtype=fullrecord'', ''^(See more details for locating this item|Detailed Record)$''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + // Set BookMark cookie + for(var i in items) { // Hack to get first item + var myCookie = sessionid+":"; + var rMatch = resultsetRegexp.exec(i); + var resultset = rMatch[1]; + break; + } + var uris = new Array(); + for(var i in items) { + var nMatch = numberRegexp.exec(i); + myCookie += resultset+"_"+nMatch[1]+","; + uris.push(i); + } + myCookie = myCookie.substr(0, myCookie.length-1); + doc.cookie = "BookMark="+myCookie; + + exportselect = ''marked''; + newUri = ''http://''+host+''/WebZ/DirectExport?numrecs=10:smartpage=directexport:entityexportnumrecs=10:entityexportresultset='' + resultset + '':entityexportrecno=1:sessionid='' + sessionid + '':entitypagenum=29:0''; + } + + Scholar.Utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) { + 
Scholar.Utilities.debugPrint(text); + var lineRegexp = new RegExp(); + lineRegexp.compile("^([\\w() ]+): *(.*)$"); + + var k = 0; + var newItem = new Scholar.Item("book"); + newItem.source = uris[k]; + + var lines = text.split(''\n''); + for(var i=0;i<lines.length;i++) { + match = lineRegexp.exec(lines[i]); + if(lines[i] == "--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------") { + // new record + k++; + if(uris[k]) { + newItem.complete(); + newItem = new Scholar.Item("book"); + newItem.source = uris[k]; + } else { + break; } - model.addStatement(uri, prefixDC + ''title'', title); - } else if(match[1] == ''Author(s)'') { - var authors = match[2].split('';''); - if(authors) { - model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(authors[0])); - for(var j=1; j<authors.length; j+=2) { - if(authors[j-1].substring(0, 1) == ''('') { - j++; + } else if(match) { + // is a useful match + if(match[1] == ''Title'') { + var title = match[2]; + if(!lineRegexp.test(lines[i+1])) { + i++; + title += '' ''+lines[i]; + } + if(title.substring(title.length-2) == " /") { + title = title.substring(0, title.length-2); + } + newItem.title = title; + } else if(match[1] == ''Author(s)'') { + var authors = match[2].split('';''); + if(authors) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[0], "author" true)); + for(var j=1; j<authors.length; j+=2) { + if(authors[j-1].substring(0, 1) == ''('') { + // ignore places where there are parentheses + j++; + } + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true)); } - model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(authors[j])); + } else { + newItem.creators.push(Scholar.Utilities.trimString(match[2])); } - } else { - model.addStatement(uri, prefixDC + ''creator'', utilities.trimString(match[2])); - } - } else 
if(match[1] == ''Publication'') { - // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it. - match[2] = utilities.trimString(match[2]); - if(match[2].substring(match[2].length-1) == '','') { - match[2] = match[2].substring(0, match[2].length-1); - } - model.addStatement(uri, prefixDC + ''publisher'', match[2]); - } else if(match[1] == ''Language'') { - model.addStatement(uri, prefixDC + ''language'', utilities.trimString(match[2])); - } else if(match[1] == ''Standard No'') { - var identifiers = match[2].split(/ +/); - var j=0; - while(j<(identifiers.length-1)) { - var type = identifiers[j].substring(0, identifiers[j].length-1); - var lastChar; - var value; - - j++; - while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') { - if(identifiers[j].substring(0, 1) != ''('') { - if(lastChar == '';'') { - value = identifiers[j].substring(0, identifiers[j].length-1); - } else { - value = identifiers[j]; + } else if(match[1] == ''Publication'') { + // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it. 
+ match[2] = Scholar.Utilities.trimString(match[2]); + if(match[2].substring(match[2].length-1) == '','') { + match[2] = match[2].substring(0, match[2].length-1); + } + newItem.publisher = match[2]; + /*} else if(match[1] == ''Language'') { + .addStatement(uri, prefixDC + ''language'', Scholar.Utilities.trimString(match[2]));*/ + } else if(match[1] == ''Standard No'') { + var identifiers = match[2].split(/ +/); + var j=0; + while(j<(identifiers.length-1)) { + var type = identifiers[j].substring(0, identifiers[j].length-1); + var lastChar; + var value; + + j++; + while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') { + if(identifiers[j].substring(0, 1) != ''('') { + if(lastChar == '';'') { + value = identifiers[j].substring(0, identifiers[j].length-1); + } else { + value = identifiers[j]; + } + if(type == "ISBN" || type == "ISSN") { + newItem[type] = value; + } } - model.addStatement(uri, prefixDC + ''identifier'', type + '' '' + value); + j++; } - j++; - } + } + } else if(match[1] == ''Year'') { + newItem.year = match[2]; } - } else if(match[1] == ''Year'') { - model.addStatement(uri, prefixDC + ''year'', match[2]); } } - } - - done(); -}) -wait();'); + + newItem.complete(); + + Scholar.done(); + }) + Scholar.wait(); +}'); -REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 3, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', -'var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; -for(i in export_options) { - if(export_options[i].text == ''Latin1 MARC'' - || export_options[i].text == ''Raw MARC'' - || export_options[i].text == ''UTF-8'' - || export_options[i].text == ''MARC (Unicode/UTF-8)'' - || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') { - // We have an exportable single record - if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { - return "multiple"; - } else { - return "book"; +REPLACE 
INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', +'function detect(doc, url) { + var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; + for(var i in export_options) { + if(export_options[i].text == ''Latin1 MARC'' + || export_options[i].text == ''Raw MARC'' + || export_options[i].text == ''UTF-8'' + || export_options[i].text == ''MARC (Unicode/UTF-8)'' + || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') { + // We have an exportable single record + if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { + return "multiple"; + } else { + return "book"; + } } } -} -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var uri = doc.location.href; -var postString = ''''; -var form = doc.forms.namedItem(''frm''); -var newUri = form.action; -var multiple = false; - -if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { - multiple = true; +}', +'function doWeb(doc, url) { + var postString = ''''; + var form = doc.forms.namedItem(''frm''); + var newUri = form.action; + var multiple = false; - var availableItems = new Object(); // Technically, associative arrays are objects + if(doc.forms.namedItem(''frm'').elements.namedItem(''RC'')) { + multiple = true; - var namespace = doc.documentElement.namespaceURI; - var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - // Require link to match this - var tagRegexp = new RegExp(); - tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti=''); - // Do not allow text to match this - var rejectRegexp = new RegExp(); - rejectRegexp.compile(''\[ [0-9]+ \]''); - - var checkboxes = new Array(); - var urls = new Array(); - - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/table/tbody/tr[td/input[@type="checkbox"]]'', nsResolver); - // Go through table rows - for(var i=0; i<tableRows.length; i++) { - // CHK is what we need to get it all as one file - var input = utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver); - checkboxes[i] = input.value; - var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); - urls[i] = links[0].href; - utilities.debugPrint(urls[i]+" = "+links[0].href); - // Go through links - for(var j=0; j<links.length; j++) { - if(tagRegexp.test(links[j].href)) { - var text = utilities.getNodeString(doc, links[j], ''.//text()'', null); - if(text) { - text = utilities.cleanString(text); - if(!rejectRegexp.test(text)) { - if(availableItems[i]) { - availableItems[i] += " "+text; - } else { - availableItems[i] = text; + var availableItems = new Object(); // Technically, associative arrays are objects + + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile(''Pwebrecon\\.cgi\\?.*v1=[0-9]+\\&.*ti=''); + // Do not allow text to match this + var rejectRegexp = new RegExp(); + rejectRegexp.compile(''\[ [0-9]+ \]''); + + var checkboxes = new Array(); + var urls = new Array(); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/table/tbody/tr[td/input[@type="checkbox"]]'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + // CHK is what we need to get it all as one file + var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver); + checkboxes[i] = input.value; + var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); + urls[i] = links[0].href; + // Go through links + for(var j=0; j<links.length; j++) { + if(tagRegexp.test(links[j].href)) { + var text = Scholar.Utilities.getNodeString(doc, links[j], ''.//text()'', null); + if(text) { + text = Scholar.Utilities.cleanString(text); + if(!rejectRegexp.test(text)) { + if(availableItems[i]) { + availableItems[i] += " "+text; + } else { + availableItems[i] = text; + } } } } } } + + var items = Scholar.selectItems(availableItems); + if(!items) { + return true; + } + + // add arguments for items we need to grab + for(var i in items) { + postString += "CHK="+checkboxes[i]+"&"; + } } - var items = utilities.selectItems(availableItems); - if(!items) { - return true; - } + var raw, unicode, latin1; - // add arguments for items we need to grab - for(i in items) { - postString += "CHK="+checkboxes[i]+"&"; - } -} - -var raw, unicode, latin1; - -for(i in form.elements) { - if(form.elements[i].type == ''HIDDEN'' || form.elements[i].type == ''hidden'') { - postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&''; - } -} - -var export_options = 
form.elements.namedItem(''RD'').options; -for(i in export_options) { - if(export_options[i].text == ''Raw MARC'' - || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') { - raw = i; - } if(export_options[i].text == ''Latin1 MARC'') { - latin1 = i; - } else if(export_options[i].text == ''UTF-8'' - || export_options[i].text == ''MARC (Unicode/UTF-8)'') { - unicode = i; + for(var i=0; i<form.elements.length; i++) { + if(form.elements[i].type && form.elements[i].type.toLowerCase() == ''hidden'') { + postString += escape(form.elements[i].name)+''=''+escape(form.elements[i].value)+''&''; + } } -} -postString += ''RD=''+i+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT''; - -utilities.debugPrint(postString); - -// No idea why this doesn''t work as post -utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) { - var records = text.split("\x1D"); - for(var i=0; i<(records.length-1); i++) { - if(multiple) { - utilities.debugPrint("uri = urls["+i+"]"); - uri = urls[i]; - utilities.debugPrint("my uri = "+uri); - } - var record = new MARC_Record(); - record.load(records[i], "binary"); - utilities.importMARCRecord(record, uri, model); - } - done(); -}) -wait();'); - -REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 3, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', -'var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -// See if this is a seach results page -if(doc.title == "JSTOR: Search Results") { - return "multiple"; -} - -// If this is a view page, find the link to the citation -var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(!elmts.length) { - var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -} -if(elmts && elmts.length) { - return "journalArticle"; -} -return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -var uri = doc.location.href; -var saveCitations = new Array(); - -if(doc.title == "JSTOR: Search Results") { - var availableItems = new Object(); - - // Require link to match this - var tagRegexp = new RegExp(); - tagRegexp.compile(''citationAction=''); - - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/div[@class="indent"]/table/tbody/tr[td/span[@class="printDownloadSaveLinks"]]'', nsResolver); - // Go through table rows - for(var i=0; i<tableRows.length; i++) { - var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); - // Go through links - for(var j=0; j<links.length; j++) { - if(tagRegexp.test(links[j].href)) { - var text = utilities.getNode(doc, tableRows[i], ''.//strong/text()'', null); - if(text && text.nodeValue) { - text = utilities.cleanString(text.nodeValue); - if(availableItems[links[j].href]) { - availableItems[links[j].href] += " "+text; - 
} else { - availableItems[links[j].href] = text; - } - } - } + + var export_options = form.elements.namedItem(''RD'').options; + for(var i=0; i<export_options.length; i++) { + if(export_options[i].text == ''Raw MARC'' + || export_options[i].text == ''MARC (non-Unicode/MARC-8)'') { + raw = i; + } if(export_options[i].text == ''Latin1 MARC'') { + latin1 = i; + } else if(export_options[i].text == ''UTF-8'' + || export_options[i].text == ''MARC (Unicode/UTF-8)'') { + unicode = i; } } - var items = utilities.selectItems(availableItems); - if(!items) { - return true; + if(unicode) { + var rd = unicode; + } else if(latin1) { + var rd = latin1; + } else if(raw) { + var rd = raw; + } else { + return false; } - for(i in items) { - saveCitations.push(i.replace(''citationAction=remove'', ''citationAction=save'')); + postString += ''RD=''+rd+''&MAILADDY=&SAVE=Press+to+SAVE+or+PRINT''; + + // No idea why this doesn''t work as post + Scholar.Utilities.HTTPUtilities.doGet(newUri+''?''+postString, null, function(text) { + // load translator for MARC + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + marc.Scholar.write(text); + marc.Scholar.eof(); + marc.doImport(url); + + Scholar.done(); + }) + Scholar.wait(); +}'); + +REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 4, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', +'function detect(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + // See if this is a seach results page + if(doc.title == "JSTOR: Search Results") { + return "multiple"; } -} else { + // If this is a view page, find the link to the citation var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(!elmts.length) { var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); } - var saveCitation = elmts[0].href; - var viewSavedCitations = elmts[1].href; - saveCitations.push(saveCitation.replace(''citationAction=remove'', ''citationAction=save'')); -} - -function getList(urls, each, done, error) { + if(elmts && elmts.length) { + return "journalArticle"; + } +}', +'function getList(urls, each, done, error) { var url = urls.shift(); - utilities.HTTPUtilities.doGet(url, null, function(text) { + Scholar.Utilities.HTTPUtilities.doGet(url, null, function(text) { if(each) { each(text); } @@ -494,564 +452,590 @@ function getList(urls, each, done, error) { }, error); } -function newDataObject() { - var data = new Object(); - data[prefixDC + "title"] = new Array(); - data[prefixDC + "creator"] = new Array(); - data[prefixDummy + "publication"] = new Array(); - data[prefixDummy + "volume"] = new Array(); - data[prefixDummy + "number"] = new Array(); - data[prefixDummy + "series"] = new Array(); - data[prefixDC + "date"] = new Array(); - data[prefixDummy + "pages"] = new Array(); - data[prefixDC + "identifier"] = new Array(); - data[prefixDC + "publisher"] = new Array(); - return data; +function itemComplete(newItem, url) { + if(!newItem.source) { + if(newItem.ISSN) { + newItem.source = 
"http://www.jstor.org/browse/"+newItem.ISSN; + } else { + newItem.source = url; + } + } + newItem.complete(); } -utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse?citationAction=removeAll&confirmRemAll=on&viewCitations=1'', null, function() { // clear marked - // Mark all our citations - getList(saveCitations, null, function() { // mark this - utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse/citations.txt?exportAction=Save+as+Text+File&exportFormat=cm&viewCitations=1'', null, function(text) { - // get marked - var k = 0; - var lines = text.split("\n"); - var haveStarted = false; - var data = newDataObject(); - var newItemRe = /^<[0-9]+>/; - var stableURL, ISSN; - - for(i in lines) { - if(lines[i].substring(0,3) == "<1>") { - haveStarted = true; - } else if(newItemRe.test(lines[i])) { - if(!stableURL) { - if(ISSN) { - stableURL = "http://www.jstor.org/browse/"+ISSN; - } else { // Just make sure it''s unique - stableURL = k; - k++; - } - } - model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false); - for(i in data) { - if(data[i].length) { - for(j in data[i]) { - model.addStatement(stableURL, i, data[i][j]); - } +function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + var saveCitations = new Array(); + + if(doc.title == "JSTOR: Search Results") { + var availableItems = new Object(); + + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile(''citationAction=''); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/div[@class="indent"]/table/tbody/tr[td/span[@class="printDownloadSaveLinks"]]'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); + // Go through links + for(var j=0; j<links.length; j++) { + if(tagRegexp.test(links[j].href)) { + var text = Scholar.Utilities.getNode(doc, tableRows[i], ''.//strong/text()'', null); + if(text && text.nodeValue) { + text = Scholar.Utilities.cleanString(text.nodeValue); + if(availableItems[links[j].href]) { + availableItems[links[j].href] += " "+text; + } else { + availableItems[links[j].href] = text; } } - var data = newDataObject(); - delete ISSN; - delete stableURL; - } else if(lines[i].substring(2, 5) == " : " && haveStarted) { - var fieldCode = lines[i].substring(0, 2); - var fieldContent = utilities.cleanString(lines[i].substring(5)) - - if(fieldCode == "TI") { - data[prefixDC + "title"].push(fieldContent); - } else if(fieldCode == "AU") { - var authors = fieldContent.split(";"); - for(j in authors) { - var author = authors[j]; - if(author) { - var splitNames = author.split('', ''); - if(splitNames) { - author = splitNames[1]+'' ''+splitNames[0]; + } + } + } + + var items = Scholar.selectItems(availableItems); + if(!items) { + return true; + } + + for(var i in items) { + saveCitations.push(i.replace(''citationAction=remove'', ''citationAction=save'')); + } + } else { + // If this is a view page, find the link to the citation + var xpath = ''/html/body/div[@class="indent"]/center/font/p/a[@class="nav"]''; + var 
elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + if(!elmts.length) { + var xpath = ''/html/body/div[@class="indent"]/center/p/font/a[@class="nav"]''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + } + var saveCitation = elmts[0].href; + var viewSavedCitations = elmts[1].href; + saveCitations.push(saveCitation.replace(''citationAction=remove'', ''citationAction=save'')); + } + + Scholar.Utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse?citationAction=removeAll&confirmRemAll=on&viewCitations=1'', null, function() { // clear marked + // Mark all our citations + getList(saveCitations, null, function() { // mark this + Scholar.Utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse/citations.txt?exportAction=Save+as+Text+File&exportFormat=cm&viewCitations=1'', null, function(text) { + // get marked + var k = 0; + var lines = text.split("\n"); + var haveStarted = false; + var newItemRe = /^<[0-9]+>/; + + var newItem = new Scholar.Item("journalArticle"); + + for(var i in lines) { + if(lines[i].substring(0,3) == "<1>") { + haveStarted = true; + } else if(newItemRe.test(lines[i])) { + itemComplete(newItem, url); + newItem = new Scholar.Item("journalArticle"); + } else if(lines[i].substring(2, 5) == " : " && haveStarted) { + var fieldCode = lines[i].substring(0, 2); + var fieldContent = Scholar.Utilities.cleanString(lines[i].substring(5)) + + if(fieldCode == "TI") { + newItem.title = fieldContent; + } else if(fieldCode == "AU") { + var authors = fieldContent.split(";"); + for(j in authors) { + if(authors[j]) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true)); } - data[prefixDC + "creator"].push(author); } + } else if(fieldCode == "SO") { + newItem.publication = fieldContent; + } else if(fieldCode == "VO") { + newItem.volume = fieldContent; + } else if(fieldCode == "NO") { + newItem.number = fieldContent; + } else if(fieldCode == "SE") { + newItem.series = 
fieldContent; + } else if(fieldCode == "DA") { + var date = new Date(fieldContent.replace(".", "")); + if(isNaN(date.valueOf())) { + newItem.date = fieldContent; + } else { + newItem.date = Scholar.Utilities.dateToISO(date); + } + } else if(fieldCode == "PP") { + newItem.pages = fieldContent; + } else if(fieldCode == "EI") { + newItem.source = fieldContent; + } else if(fieldCode == "IN") { + newItem.ISSN = fieldContent; + } else if(fieldCode == "PB") { + newItem.publisher = fieldContent; } - } else if(fieldCode == "SO") { - data[prefixDummy + "publication"].push(fieldContent); - } else if(fieldCode == "VO") { - data[prefixDummy + "volume"].push(fieldContent); - } else if(fieldCode == "NO") { - data[prefixDummy + "number"].push(fieldContent); - } else if(fieldCode == "SE") { - data[prefixDummy + "series"].push(fieldContent); - } else if(fieldCode == "DA") { - var date = new Date(fieldContent.replace(".", "")); - if(isNaN(date.valueOf())) { - data[prefixDC + "date"].push(fieldContent); - } else { - data[prefixDC + "date"].push(utilities.dateToISO(date)); - } - } else if(fieldCode == "PP") { - data[prefixDummy + "pages"].push(fieldContent); - } else if(fieldCode == "EI") { - stableURL = fieldContent; - } else if(fieldCode == "IN") { - data[prefixDC + "identifier"].push("ISSN "+fieldContent); - ISSN = fieldContent; - } else if(fieldCode == "PB") { - data[prefixDC + "publisher"].push(fieldContent); } } - } - - // Loop through again so that we can add with the stableURL - if(!stableURL) { - if(ISSN) { - stableURL = "http://www.jstor.org/browse/"+ISSN; - } else { // Just make sure it''s unique - stableURL = k; - k++; - } - } - model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false); - for(i in data) { - if(data[i].length) { - for(j in data[i]) { - model.addStatement(stableURL, i, data[i][j]); - } + + // last item is complete + if(haveStarted) { + itemComplete(newItem, url); } - } - - done(); - }); - }, function() {}); -}); - -wait();'); + 
+ Scholar.done(); + }); + }, function() {}); + }); + + Scholar.wait(); +}'); -REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 3, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', -'if(doc.title == "History Cooperative: Search Results") { - return "multiple"; -} else { - return "journalArticle"; +REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', +'function detect(doc, url) { + if(doc.title == "History Cooperative: Search Results") { + return "multiple"; + } else { + return "journalArticle"; + } }', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -function associateMeta(uri, metaTags, field, rdfUri) { +'function associateMeta(newItem, metaTags, field, scholarField) { var field = metaTags.namedItem(field); if(field) { - model.addStatement(uri, rdfUri, field.getAttribute("content"), false); + newItem[scholarField] = field.getAttribute("content"); } } function scrape(doc) { - var uri = doc.location.href; + var newItem = new Scholar.Item("journalArticle"); + newItem.source = doc.location.href; + var month, year; var metaTags = doc.getElementsByTagName("meta"); - associateMeta(uri, metaTags, "Title", prefixDC + "title"); - associateMeta(uri, metaTags, "Journal", prefixDummy + "publication"); - associateMeta(uri, metaTags, "Volume", prefixDummy + "volume"); - associateMeta(uri, metaTags, "Issue", prefixDummy + "number"); + associateMeta(newItem, metaTags, "Title", "title"); + associateMeta(newItem, metaTags, "Journal", "publication"); + associateMeta(newItem, 
metaTags, "Volume", "volume"); + associateMeta(newItem, metaTags, "Issue", "number"); var author = metaTags.namedItem("Author"); if(author) { var authors = author.getAttribute("content").split(" and "); for(j in authors) { - model.addStatement(uri, prefixDC + "creator", authors[j], false); + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author")); } } - var month = metaTags.namedItem("PublicationMonth"); + newItem.complete(); + + // don''t actually need date info for a journal article + /*var month = metaTags.namedItem("PublicationMonth"); var year = metaTags.namedItem("PublicationYear"); if(month && year) { - model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); - } - - model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); + odel.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); + }*/ } -if(doc.title == "History Cooperative: Search Results") { - var items = utilities.getItemArray(doc, doc, ''^http://[^/]+/journals/.+/.+/.+\.html$''); - items = utilities.selectItems(items); - - if(!items) { - return true; +function doWeb(doc, url) { + if(doc.title == "History Cooperative: Search Results") { + var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://[^/]+/journals/.+/.+/.+\.html$''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { Scholar.done(); }, function() {}); + + Scholar.wait(); + } else { + scrape(doc); } - - var uris = new Array(); - for(i in items) { - uris.push(i); - } - - utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, - function() { done(); }, function() {}); - - wait(); -} else { - scrape(doc); }'); -REPLACE INTO 
"translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-28 22:52:00', 3, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', -'// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button -var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); -if(matchRegexp.test(doc.location.href)) { - return "book"; -} -// Next, look for the MARC button -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -var xpath = ''//a[img[@alt="MARC Display"]]''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(elmts.length) { - return "book"; -} -// Also, check for links to an item display page -var tags = doc.getElementsByTagName("a"); -for(i=0; i<tags.length; i++) { - if(matchRegexp.test(tags[i].href)) { - return "multiple"; +REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-28 22:52:00', 4, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', +'function detect(doc, url) { + // First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button + var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); + if(matchRegexp.test(doc.location.href)) { + return "book"; } -} -return false; -', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - - -var uri = doc.location.href; -var newUri; - -var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); -var m = matchRegexp.exec(uri); -if(m) { - newUri = m[1]+''marc''+m[2]; -} else { + // Next, look for the MARC button var namespace = 
doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - + var xpath = ''//a[img[@alt="MARC Display"]]''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { - newUri = elmts[0].href; + return "book"; } -} - -if(newUri) { - utilities.loadDocument(newUri, browser, function(newBrowser) { - newDoc = newBrowser.contentDocument; - - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var xpath = ''//pre''; - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - - var text = utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue; - - var record = new MARC_Record(); - record.load(text, "MARC_PAC"); - utilities.importMARCRecord(record, uri, model); - done(); - }, function() {}); -} else { // Search results page - // Require link to match this - var tagRegexp = new RegExp(); - tagRegexp.compile(''^http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/frameset''); - - var checkboxes = new Array(); - var urls = new Array(); - var availableItems = new Array(); - - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//table[@class="browseScreen"]//tr[td/input[@name="save"]]'', nsResolver); - // Go through table rows - for(var i=0; i<tableRows.length; i++) { - // CHK is what we need to get it all as one file - var input = utilities.getNode(doc, tableRows[i], ''./td/input[@name="save"]'', nsResolver); - checkboxes[i] = input.value; - var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); - urls[i] = links[0].href; - // Go through links - for(var j=0; j<links.length; j++) { - if(tagRegexp.test(links[j].href)) { - var text = utilities.getNodeString(doc, links[j], 
''.//text()'', null); - if(text) { - text = utilities.cleanString(text); - if(availableItems[i]) { - availableItems[i] += " "+text; - } else { - availableItems[i] = text; - } - } - } + // Also, check for links to an item display page + var tags = doc.getElementsByTagName("a"); + for(var i=0; i<tags.length; i++) { + if(matchRegexp.test(tags[i].href)) { + return "multiple"; } } - var items = utilities.selectItems(availableItems); - - if(!items) { - return true; - } + return false; +}', +'function doWeb(doc, url) { + var uri = doc.location.href; + var newUri; - var urlRe = new RegExp("^(http://[^/]+(/search/[^/]+/))"); - var m = urlRe.exec(urls[0]); - var clearUrl = m[0]+"?clear_saves=1"; - var postUrl = m[0]; - var exportUrl = m[1]+"++export/1,-1,-1,B/export"; + var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); + var m = matchRegexp.exec(uri); + if(m) { + newUri = m[1]+''marc''+m[2]; + } else { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; - var postString = ""; - for(i in items) { - postString += "save="+checkboxes[i]+"&"; - } - /*var hiddens = utilities.gatherElementsOnXPath(doc, doc, ''//form[@action="''+actionUrl+''"]//input[@type="hidden"]'', nsResolver); - for(var i=0; i<hiddens.length; i++) { - if(hiddens[i].name != "save_func") { - postString += hiddens[i].name+"="+hiddens[i].value+"&"; + var xpath = ''//a[img[@alt="MARC Display"]]''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + if(elmts.length) { + newUri = elmts[0].href; } - }*/ - postString += "save_func=save_marked"; + } + // load translator for MARC + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - utilities.HTTPUtilities.doGet(clearUrl, null, function() { - utilities.HTTPUtilities.doPost(postUrl, postString, null, function() { - utilities.HTTPUtilities.doPost(exportUrl, "ex_format=50&ex_device=45&SUBMIT=Submit", null, function(text) { - var records = text.split("\x1D"); - for(var i=0; i<(records.length-1); i++) { - var record = new MARC_Record(); - record.load(records[i], "binary"); - utilities.importMARCRecord(record, urls[i], model); + if(newUri) { // single page + Scholar.Utilities.loadDocument(newUri, function(newBrowser) { + newDoc = newBrowser.contentDocument; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''//pre''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + + var text = Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue; + + var newItem = new Scholar.Item(); + newItem.source = uri; + + var record = new marc.MARC_Record(); + + var linee = text.split("\n"); + for (var i=0; i<linee.length; i++) { + linee[i] = linee[i].replace(/\xA0|_|\t/g,'' ''); + linee[i] = Scholar.Utilities.cleanString(linee[i]); + if (linee[i] == '''') continue; // jumps empty lines + var replacer = record.subfield_delimiter+''$1''; + linee[i] = linee[i].replace(/\|(.)/g,replacer); + linee[i] = linee[i].replace(/\|/g,this.subfield_delimiter); + var tag = linee[i].substr(0,3); + var ind1 = linee[i].substr(4,1); + var ind2 = linee[i].substr(5,1); + var value = record.subfield_delimiter+''a''+linee[i].substr(7); + if(linee[i].substr(0, 6) == "LEADER") { + value = linee[i].substr(7); + record.leader.record_length = ''00000''; + record.leader.record_status = value.substr(5,1); + record.leader.type_of_record = value.substr(6,1); + record.leader.bibliographic_level = value.substr(7,1); + record.leader.type_of_control = value.substr(8,1); + record.leader.character_coding_scheme = value.substr(9,1); + record.leader.indicator_count = ''2''; + record.leader.subfield_code_length = ''2''; + record.leader.base_address_of_data = ''00000''; + record.leader.encoding_level = value.substr(17,1); + record.leader.descriptive_cataloging_form = value.substr(18,1); + record.leader.linked_record_requirement = value.substr(19,1); + record.leader.entry_map = ''4500''; + + record.directory = ''''; + record.directory_terminator = record.field_terminator; + record.variable_fields = new Array(); + } + else if (tag > ''008'' && tag < ''899'') { // jumps low and high tags + if (tag != ''040'') record.add_field(tag,ind1,ind2,value); + } + } + + 
record.translate(newItem); + newItem.complete(); + + Scholar.done(); + }, function() {}); + } else { // Search results page + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile(''^http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/frameset''); + + var checkboxes = new Array(); + var urls = new Array(); + var availableItems = new Array(); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//table[@class="browseScreen"]//tr[td/input[@type="checkbox"]]'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + // CHK is what we need to get it all as one file + var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./td/input[@type="checkbox"]'', nsResolver); + checkboxes[i] = input.name+"="+escape(input.value); + var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); + urls[i] = links[0].href; + // Go through links + for(var j=0; j<links.length; j++) { + if(tagRegexp.test(links[j].href)) { + var text = Scholar.Utilities.getNodeString(doc, links[j], ''.//text()'', null); + if(text) { + text = Scholar.Utilities.cleanString(text); + if(availableItems[i]) { + availableItems[i] += " "+text; + } else { + availableItems[i] = text; + } + } } - done(); + } + } + + var items = Scholar.selectItems(availableItems); + + if(!items) { + return true; + } + + var urlRe = new RegExp("^(http://[^/]+(/search/[^/]+/))"); + var m = urlRe.exec(urls[0]); + var clearUrl = m[0]+"?clear_saves=1"; + var postUrl = m[0]; + var exportUrl = m[1]+"++export/1,-1,-1,B/export"; + + var postString = ""; + for(var i in items) { + postString += checkboxes[i]+"&"; + } + postString += "save_func=save_marked"; + + + Scholar.Utilities.HTTPUtilities.doGet(clearUrl, null, function() { + Scholar.Utilities.HTTPUtilities.doPost(postUrl, postString, null, function() { + Scholar.Utilities.HTTPUtilities.doPost(exportUrl, "ex_format=50&ex_device=45&SUBMIT=Submit", null, function(text) { + 
marc.Scholar.write(text); + marc.Scholar.eof(); + marc.doImport(url); + + Scholar.done(); + }); }); }); - }); -} - -wait();'); - -REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 3, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', -'var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(elmts.length) { - return "book"; -} -var xpath = ''//td[@class="searchsum"]/table''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(elmts.length) { - return "multiple"; -} - -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; + } -var data = new Object(); + Scholar.wait(); +}'); -function scrape(doc) { - var uri = doc.location.href; +REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 4, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +'function detect(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + if(elmts.length) { + return "book"; + } + var xpath = ''//td[@class="searchsum"]/table''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + if(elmts.length) { + return "multiple"; + } +}', +'function scrape(doc) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(!elmts.length) { return false; } + + var newItem = new Scholar.Item("book"); + newItem.source = doc.location.href; + for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; try { - var node = utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); + var node = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); if(!node) { - var node = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); + var node = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); } if(node) { - var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); + var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); field = field.toLowerCase(); - var value = utilities.superCleanString(node.nodeValue); - var rdfUri = null; + var value = Scholar.Utilities.superCleanString(node.nodeValue); if(field == "publisher") { - rdfUri = prefixDC + ''publisher''; + newItem.publisher = value; } else if(field 
== "pub date") { - rdfUri = prefixDC + ''year''; - var re = /[0-9]+/; var m = re.exec(value); - value = m[0]; + newItem.year = m[0]; } else if(field == "isbn") { - rdfUri = prefixDC + ''identifier''; - var re = /^[0-9](?:[0-9X]+)/; var m = re.exec(value); - value = "ISBN "+m[0]; + newItem.ISBN = m[0]; } else if(field == "title") { - rdfUri = prefixDC + ''title''; var titleParts = value.split(" / "); - value = titleParts[0]; + newItem.title = titleParts[0]; } else if(field == "publication info") { - rdfUri = prefixDummy + ''place''; var pubParts = value.split(" : "); - value = pubParts[0]; + newItem.place = pubParts[0]; } else if(field == "personal author") { - rdfUri = prefixDC + ''creator''; - value = utilities.cleanAuthor(node.nodeValue); + newItem.creators.push(Scholar.Utilities.cleanAuthor(value, "author", true)); } else if(field == "added author") { - rdfUri = prefixDC + ''contributor''; - value = utilities.cleanAuthor(node.nodeValue); + newItem.creators.push(Scholar.Utilities.cleanAuthor(value, "contributor", true)); } else if(field == "corporate author") { - rdfUri = prefixDummy + ''corporateCreator''; - } - if(rdfUri) { - var insert = true; - if(data && data[rdfUri]) { - for(j in data[rdfUri]) { - if(data[rdfUri][j] == value) { - insert = false; - break; - } - } - } else if(!data[rdfUri]) { - data[rdfUri] = new Array(); - } - if(insert) { - data[rdfUri].push(value); - model.addStatement(uri, rdfUri, value, true); - } + newItem.creators.push({lastName:author}); } } } catch (e) {} } - var callNumber = utilities.getNode(doc, doc, ''//tr/td[1][@class="holdingslist"]/text()'', nsResolver); + var callNumber = Scholar.Utilities.getNode(doc, doc, ''//tr/td[1][@class="holdingslist"]/text()'', nsResolver); if(callNumber && callNumber.nodeValue) { - model.addStatement(uri, prefixDC + "identifier", "CN "+callNumber.nodeValue, true); + newItem.callNumber = callNumber.nodeValue; } - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + 
newItem.complete(); return true; } -if(!scrape(doc)) { - var checkboxes = new Array(); - var urls = new Array(); - var availableItems = new Array(); - - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver); - // Go through table rows - for(var i=1; i<tableRows.length; i++) { - var input = utilities.getNode(doc, tableRows[i], ''.//input[@value="Details"]'', nsResolver); - checkboxes[i] = input.name; - var text = utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver); - if(text) { - availableItems[i] = text; +function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + if(!scrape(doc)) { + var checkboxes = new Array(); + var urls = new Array(); + var availableItems = new Array(); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver); + // Go through table rows + for(var i=1; i<tableRows.length; i++) { + var input = Scholar.Utilities.getNode(doc, tableRows[i], ''.//input[@value="Details"]'', nsResolver); + checkboxes[i] = input.name; + var text = Scholar.Utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver); + if(text) { + availableItems[i] = text; + } } + + var items = Scholar.selectItems(availableItems); + + if(!items) { + return true; + } + + var hostRe = new RegExp("^http://[^/]+"); + var m = hostRe.exec(doc.location.href); + var hitlist = doc.forms.namedItem("hitlist"); + var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value; + Scholar.Utilities.debugPrint(baseUrl); + + var uris = new Array(); + for(var i in items) { + uris.push(baseUrl+"&"+checkboxes[i]+"=Details"); + } + + 
Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { Scholar.done() }, function() {}); + + Scholar.wait(); } - - var items = utilities.selectItems(availableItems); - - if(!items) { - return true; - } - - var hostRe = new RegExp("^http://[^/]+"); - var m = hostRe.exec(doc.location.href); - var hitlist = doc.forms.namedItem("hitlist"); - var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value; - utilities.debugPrint(baseUrl); - - var uris = new Array(); - for(i in items) { - uris.push(baseUrl+"&"+checkboxes[i]+"=Details"); - } - - utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, - function() { done() }, function() {}); - - wait(); } '); -REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 3, 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', -'if(doc.title == "Results") { - return "magazineArticle"; -} else { - return "book"; +REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', +'function detect(doc, url) { + if(doc.title == "Results") { + return "magazineArticle"; + } else { + return "book"; + } }', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; +'function scrape(doc) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; -function scrape(doc) { - var uri = doc.location.href; + var newItem = new Scholar.Item(); + newItem.source = doc.location.href; // Title var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); var title = ""; for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; title += elmt.nodeValue; } if(title) { - model.addStatement(uri, prefixDC + ''title'', title, true); + newItem.title = title; } // Authors var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="textMedium"]/a/em''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; - - // Dirty hack to fix highlighted words - var xpath = ''.//text()''; - var author = ""; - var authorElmts = utilities.gatherElementsOnXPath(doc, elmt, xpath, nsResolver); - for (var j = 0; j < authorElmts.length; j++) { - var authorElmt = authorElmts[j]; - author += authorElmt.nodeValue; + // there are sometimes additional tags representing higlighting + var author = getNodeString(doc, links[j], ''.//text()'', null); + if(author) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author", true)); } - model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(author), true); } // Other info var xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + var elmts = 
Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; - var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase(); + var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase(); if(field == "publication title") { - var publication = utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver); + var publication = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver); if(publication.nodeValue) { - model.addStatement(uri, prefixDummy + ''publication'', utilities.superCleanString(publication.nodeValue), true); + newItem.publication = Scholar.Utilities.superCleanString(publication.nodeValue); } - var place = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + + var place = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); if(place.nodeValue) { - model.addStatement(uri, prefixDummy + ''place'', utilities.superCleanString(place.nodeValue), true); + newItem.place = Scholar.Utilities.superCleanString(place.nodeValue); } - var date = utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver); + + var date = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver); if(date.nodeValue) { date = date.nodeValue; - var jsDate = new Date(utilities.superCleanString(date)); + var jsDate = new Date(Scholar.Utilities.superCleanString(date)); if(!isNaN(jsDate.valueOf())) { - date = utilities.dateToISO(jsDate); + date = Scholar.Utilities.dateToISO(jsDate); } - model.addStatement(uri, prefixDC + ''date'', date, true); + newItem.date = date; } - var moreInfo = utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver); + + var moreInfo = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver); if(moreInfo.nodeValue) { - moreInfo = 
utilities.superCleanString(moreInfo.nodeValue); + moreInfo = Scholar.Utilities.superCleanString(moreInfo.nodeValue); var parts = moreInfo.split(";\xA0"); var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/ @@ -1061,122 +1045,122 @@ function scrape(doc) { if(m) { var info = m[1].toLowerCase(); if(info == "vol") { - model.addStatement(uri, prefixDummy + ''volume'', utilities.superCleanString(m[2]), true); + newItem.volume = Scholar.Utilities.superCleanString(m[2]); } else if(info == "iss" || info == "no") { - model.addStatement(uri, prefixDummy + ''number'', utilities.superCleanString(m[2]), true); + newItem.number = Scholar.Utilities.superCleanString(m[2]); } } } - if(parts[1] && utilities.superCleanString(parts[1]).substring(0, 3).toLowerCase() == "pg.") { + if(parts[1] && Scholar.Utilities.superCleanString(parts[1]).substring(0, 3).toLowerCase() == "pg.") { var re = /[0-9\-]+/; var m = re.exec(parts[1]); if(m) { - model.addStatement(uri, prefixDummy + ''pages'', m[0], true); + newItem.pages = m[0]; } } } } else if(field == "source type") { - var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); if(value.nodeValue) { - value = utilities.superCleanString(value.nodeValue).toLowerCase(); - utilities.debugPrint(value); + value = Scholar.Utilities.superCleanString(value.nodeValue).toLowerCase(); + Scholar.Utilities.debugPrint(value); if(value.indexOf("periodical") >= 0) { - model.addStatement(uri, prefixRDF + "type", prefixDummy + "magazineArticle", false); + newItem.itemType = "magazineArticle"; } else if(value.indexOf("newspaper") >= 0) { - model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false); - } else { - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + newItem.itemType = "newspaperArticle"; + } else { // TODO: support thesis + newItem.itemType = "book"; } } } else if(field == "isbn" || field == "issn" || 
field == "issn/isbn") { - var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); if(value) { var type; - value = utilities.superCleanString(value.nodeValue); + value = Scholar.Utilities.superCleanString(value.nodeValue); if(value.length == 10 || value.length == 13) { - type = "ISBN"; + newItem.ISBN = value; } else if(value.length == 8) { - type = "ISSN"; - } - if(type) { - model.addStatement(uri, prefixDC + "identifier", type+" "+value, false); + newItem.ISSN = value; } } } } + + newItem.complete(); } -if(doc.title == "Results") { - var items = new Object(); - - // Require link to match this - var tagRegexp = new RegExp(); - tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)''); - - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[@class="rowUnMarked"]/td[3][@class="textMedium"]'', nsResolver); - // Go through table rows - for(var i=0; i<tableRows.length; i++) { - var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); - // Go through links - for(var j=0; j<links.length; j++) { - if(tagRegexp.test(links[j].href)) { - var text = utilities.getNode(doc, tableRows[i], ''./a[@class="bold"]/text()'', null); - if(text && text.nodeValue) { - text = utilities.cleanString(text.nodeValue); - items[links[j].href] = text; +function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + if(doc.title == "Results") { + var items = new Object(); + + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)''); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[@class="rowUnMarked"]/td[3][@class="textMedium"]'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); + // Go through links + for(var j=0; j<links.length; j++) { + if(tagRegexp.test(links[j].href)) { + var text = Scholar.Utilities.getNode(doc, tableRows[i], ''./a[@class="bold"]/text()'', null); + if(text && text.nodeValue) { + text = Scholar.Utilities.cleanString(text.nodeValue); + items[links[j].href] = text; + } + break; } - break; } } - } - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - var uris = new Array(); - for(i in items) { - uris.push(i); - } - - utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, - function() { done(); }, function() {}); - - wait(); -} else { - var fmtCheck = /(?:\&|\?)Fmt=([0-9]+)/ - var m = fmtCheck.exec(doc.location.href); - if(m && (m[1] == "1" || m[1] == "2")) { - scrape(doc); - } else if(m) { - utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), browser, function(browser) { scrape(browser.contentDocument); done(); }, function() {}); - wait(); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { Scholar.done(); }, function() {}); + + Scholar.wait(); + } 
else { + var fmtCheck = /(?:\&|\?)Fmt=([0-9]+)/ + var m = fmtCheck.exec(doc.location.href); + if(m && (m[1] == "1" || m[1] == "2")) { + scrape(doc); + } else if(m) { + Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(browser) { scrape(browser.contentDocument); Scholar.done(); }, function() {}); + Scholar.wait(); + } } }'); -REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 3, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', -'if(doc.title.substring(0, 8) == "Article ") { - return "magazineArticle"; -} else doc.title.substring(0, 10) == "Citations ") { - return "multiple"; -} -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -function extractCitation(uri, elmts, title) { +REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 4, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', +'function detect(doc, url) { + if(doc.title.substring(0, 8) == "Article ") { + return "magazineArticle"; + } else doc.title.substring(0, 10) == "Citations ") { + return "multiple"; + } +}', +'function extractCitation(uri, elmts, title) { + var newItem = new Scholar.Item(); + newItem.source = uri; + if(title) { - model.addStatement(uri, prefixDC + "title", utilities.superCleanString(title), true); + newItem.title = Scholar.Utilities.superCleanString(title); } for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; @@ -1184,118 +1168,127 @@ function extractCitation(uri, elmts, title) { var field = elmt.nodeValue.substring(1, colon).toLowerCase(); var value = elmt.nodeValue.substring(colon+1, elmt.nodeValue.length-1); if(field == "title") { - model.addStatement(uri, prefixDC + "title", utilities.superCleanString(value), true); + newItem.title = Scholar.Utilities.superCleanString(value); } else if(field == "journal") { - model.addStatement(uri, prefixDummy + "publication", value, true); + newItem.publication = value; } else if(field == "pi") { parts = value.split(" "); var date = ""; - var isDate = true; - var rdfUri, type; + var field = null; for(j in parts) { firstChar = parts[j].substring(0, 1); - rdfUri = false; if(firstChar == "v") { - rdfUri = prefixDummy + "volume"; - type = prefixDummy + "journalArticle"; + newItem.itemType = "journalArticle"; + field = "volume"; } else if(firstChar == "i") { - rdfUri = prefixDummy + "number"; + field = "issue"; } else if(firstChar == "p") { - rdfUri = prefixDummy + "pages"; - var pagesRegexp = /p(\w+)\((\w+)\)/; + field = "pages"; + + var pagesRegexp = /p(\w+)\((\w+)\)/; // weird 
looking page range var match = pagesRegexp.exec(parts[j]); - if(match) { + if(match) { // yup, it''s weird var finalPage = parseInt(match[1])+parseInt(match[2]) parts[j] = "p"+match[1]+"-"+finalPage.toString(); - } else if(!type) { + } else if(!type) { // no, it''s normal + // check to see if it''s numeric, bc newspaper pages aren''t var justPageNumber = parts[j].substr(1); if(parseInt(justPageNumber).toString() != justPageNumber) { - type = prefixDummy + "newspaperArticle"; + newItem.itemType = "newspaperArticle"; } } + } else if(!field) { // date parts at the beginning, before + // anything else + date += " "+parts[j]; } - if(rdfUri) { + if(field) { isDate = false; - if(parts[j] != "pNA") { // not a real page number - var content = parts[j].substring(1); - model.addStatement(uri, rdfUri, content, false); - } else if(!type) { - type = prefixDummy + "newspaperArticle"; + + if(parts[j] != "pNA") { // make sure it''s not an invalid + // page number + // chop of letter + newItem[field] = parts[j].substring(1); + } else if(!type) { // only newspapers are missing + // page numbers on infotrac + newItem.itemType = "newspaperArticle"; } - } else if(isDate) { - date += " "+parts[j]; } } // Set type - if(!type) { - type = prefixDummy + "magazineArticle"; + if(!newItem.itemType) { + newItem.itemType = "magazineArticle"; } - model.addStatement(uri, prefixRDF + "type", type, false); if(date != "") { - model.addStatement(uri, prefixDC + "date", date.substring(1), true); + newItem.date = date.substring(1); } } else if(field == "author") { - model.addStatement(uri, prefixDC + "creator", utilities.cleanAuthor(value), true); + newItem.creators.push(Scholar.Utilities.cleanAuthor(value, "author", true)); } } + + newItem.complete(); } +function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; -var uri = doc.location.href; -if(doc.title.substring(0, 8) == "Article ") { - var xpath = ''/html/body//comment()''; - var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - extractCitation(uri, elmts); -} else { - var items = new Array(); - var uris = new Array(); - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body//table/tbody/tr/td[a/b]'', nsResolver); - // Go through table rows - for(var i=0; i<tableRows.length; i++) { - var link = utilities.getNode(doc, tableRows[i], ''./a'', nsResolver); - uris[i] = link.href; - var article = utilities.getNode(doc, link, ''./b/text()'', nsResolver); - items[i] = article.nodeValue; - // Chop off final period - if(items[i].substr(items[i].length-1) == ".") { - items[i] = items[i].substr(0, items[i].length-1); + var uri = doc.location.href; + if(doc.title.substring(0, 8) == "Article ") { // article + var xpath = ''/html/body//comment()''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + extractCitation(uri, elmts); + } else { // search results + var items = new Array(); + var uris = new Array(); + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body//table/tbody/tr/td[a/b]'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + var link = Scholar.Utilities.getNode(doc, tableRows[i], ''./a'', nsResolver); + uris[i] = link.href; + var article = Scholar.Utilities.getNode(doc, link, ''./b/text()'', nsResolver); + items[i] = article.nodeValue; + // Chop off final period + if(items[i].substr(items[i].length-1) == ".") { + items[i] = items[i].substr(0, items[i].length-1); + } + } + + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + for(var i in items) { + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ".//comment()", nsResolver); + extractCitation(uris[i], elmts, 
items[i]); } - } - - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - for(i in items) { - var elmts = utilities.gatherElementsOnXPath(doc, tableRows[i], ".//comment()", nsResolver); - extractCitation(uris[i], elmts, items[i]); } }'); -REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 3, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)', -'var detailRe = new RegExp("^http://[^/]+/universe/document"); -if(detailRe.test(doc.location.href)) { - return "newspaperArticle"; -} else { - return "multiple"; +REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 4, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)', +'function detect(doc, url) { + var detailRe = new RegExp("^http://[^/]+/universe/document"); + if(detailRe.test(doc.location.href)) { + return "newspaperArticle"; + } else { + return "multiple"; + } }', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -function scrape(doc) { - var uri = doc.location.href; +'function scrape(doc) { + var newItem = new Scholar.Item(); + newItem.source = doc.location.href; var citationDataDiv; var divs = doc.getElementsByTagName("div"); - for(i in divs) { + for(var i=0; i<divs.length; i++) { if(divs[i].className == "bodytext") { citationDataDiv = divs[i]; break; @@ -1304,16 +1297,16 @@ function scrape(doc) { centerElements = citationDataDiv.getElementsByTagName("center"); var elementParts = centerElements[0].innerHTML.split(/<br[^>]*>/gi); - model.addStatement(uri, prefixDummy + "publication", elementParts[elementParts.length-1], true); + newItem.publication = elementParts[elementParts.length-1]; var dateRegexp = 
/<br[^>]*>(?:<b>)?([A-Z][a-z]+)(?:<\/b>)? ([0-9]+, [0-9]{4})/; var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); if(m) { var jsDate = new Date(m[1]+" "+m[2]); - model.addStatement(uri, prefixDC + "date", utilities.dateToISO(jsDate), true); + newItem.date = Scholar.Utilities.dateToISO(jsDate); } else { var elementParts = centerElements[centerElements.length-1].innerHTML.split(/<br[^>]*>/gi); - model.addStatement(uri, prefixDC + "date", elementParts[1], true); + newItem.date = elementParts[1]; } var cutIndex = citationDataDiv.innerHTML.indexOf("<b>BODY:</b>"); @@ -1326,1147 +1319,1114 @@ function scrape(doc) { citationData = citationDataDiv.innerHTML; } - citationData = utilities.cleanTags(citationData); + citationData = Scholar.Utilities.cleanTags(citationData); var headlineRegexp = /\n(?:HEADLINE|TITLE|ARTICLE): ([^\n]+)\n/; var m = headlineRegexp.exec(citationData); if(m) { - model.addStatement(uri, prefixDC + "title", utilities.cleanTags(m[1]), true); + newItem.title = Scholar.Utilities.cleanTags(m[1]); } var bylineRegexp = /\nBYLINE: *(\w[\w\- ]+)/; var m = bylineRegexp.exec(citationData); - if(m) { + if(m) { // there is a byline; use it as an author if(m[1].substring(0, 3).toLowerCase() == "by ") { m[1] = m[1].substring(3); } - model.addStatement(uri, prefixDC + "creator", m[1], true); - model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false); - } else { - model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); + newItem.creators.push(Scholar.Utilities.cleanAuthor(m[1], "author")); + + newItem.itemType = "newspaperArticle"; + } else { // no byline; must be a journal + newItem.itemType = "journalArticle"; } - var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; + // other ways authors could be encoded + var authorRegexp = /\n(?:AUTHOR|NAME): ([^\n]+)\n/; var m = authorRegexp.exec(citationData); if(m) { var authors = m[1].split(/, (?:and )?/); - for(i in authors) { - 
model.addStatement(uri, prefixDC + "creator", authors[i].replace(" *", ""), true); + for(var i in authors) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[i].replace(" *", ""), "author")); } } + + newItem.complete(); } -var detailRe = new RegExp("^http://[^/]+/universe/document"); -if(detailRe.test(doc.location.href)) { - scrape(doc); -} else { - var items = utilities.getItemArray(doc, doc, "^http://[^/]+/universe/document"); - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - var uris = new Array(); - for(i in items) { - uris.push(i); +function doWeb(doc, url) { + var detailRe = new RegExp("^http://[^/]+/universe/document"); + if(detailRe.test(doc.location.href)) { + scrape(doc); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, "^http://[^/]+/universe/document"); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + + Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { Scholar.done(); }, function() {}); + + Scholar.wait(); } - - utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, - function() { done(); }, function() {}); - - wait(); }'); -REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 3, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', -'var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); - -if(singleRe.test(doc.location.href)) { - return "book"; -} else { - var tags = doc.getElementsByTagName("a"); - for(var i=0; i<tags.length; i++) { - if(singleRe.test(tags[i].href)) { - return "multiple"; +REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph Scraper', 'Simon Kornblith', 
'^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', +'function detect(doc, url) { + var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); + + if(singleRe.test(doc.location.href)) { + return "book"; + } else { + var tags = doc.getElementsByTagName("a"); + for(var i=0; i<tags.length; i++) { + if(singleRe.test(tags[i].href)) { + return "multiple"; + } } } -} -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var detailRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); -var uri = doc.location.href; -var newUris = new Array(); - -if(detailRe.test(uri)) { +}', +'function doWeb(doc, url) { + var detailRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); + var uri = doc.location.href; + var newUris = new Array(); + + if(detailRe.test(uri)) { newUris.push(uri.replace(/\&format=[0-9]{3}/, "&format=001")) -} else { - var items = utilities.getItemArray(doc, doc, ''^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=999'', ''^[0-9]+$''); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=999'', ''^[0-9]+$''); // ugly hack to see if we have any items var haveItems = false; - for(i in items) { + for(var i in items) { haveItems = true; break; } // If we don''t have any items otherwise, let us use the numbers if(!haveItems) { - var items = utilities.getItemArray(doc, doc, ''^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=999''); + var items = Scholar.Utilities.getItemArray(doc, doc, ''^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=999''); } - items = utilities.selectItems(items); + items = Scholar.selectItems(items); if(!items) { return true; } - for(i in 
items) { + for(var i in items) { newUris.push(i.replace("&format=999", "&format=001")); } -} - -utilities.processDocuments(browser, null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href; - - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; + } - var xpath = ''/html/body/table/tbody/tr[td[1][@id="bold"]][td[2]]''; - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - var record = new MARC_Record(); - for(var i=0; i<elmts.length; i++) { - var elmt = elmts[i]; - var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue); - var value = utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver); - var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1"); - - if(field != "FMT" && field != "LDR") { - var ind1 = ""; - var ind2 = ""; - var code = field.substring(0, 3); - if(field.length > 3) { - var ind1 = field.charAt(3); - if(field.length > 4) { - var ind2 = field.charAt(4); + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''/html/body/table/tbody/tr[td[1][@id="bold"]][td[2]]''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + + var record = new marc.MARC_Record(); + for(var i=0; i<elmts.length; i++) { + var elmt = elmts[i]; + var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue); + var value = Scholar.Utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver); + var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1"); + + if(field != "FMT" && field != "LDR") { + var ind1 = ""; + var ind2 = ""; + var code = field.substring(0, 3); + if(field.length > 3) { + var ind1 = field.charAt(3); + if(field.length > 4) { + var ind2 = field.charAt(4); + } } + record.add_field(code, ind1, ind2, value); } - record.add_field(code, ind1, ind2, value); } - } - utilities.importMARCRecord(record, uri, model); -}, function() { done(); }, function() {}); - -wait();'); + + var newItem = new Scholar.Item(); + newItem.source = uri; + record.translate(newItem); + newItem.complete(); + }, function() { Scholar.done(); }, function() {}); + + Scholar.wait(); +}'); -REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 3, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', -'var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); -if(detailsRe.test(doc.location.href)) { - return "book"; -} else { - return "multiple"; +REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', +'function detect(doc, url) { + var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); + if(detailsRe.test(doc.location.href)) { + return "book"; + } else { + return "multiple"; + } }', 
-'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var uri = doc.location.href; -var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); +'function scrape(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; -var uris = new Array(); -if(detailsRe.test(uri)) { - uris.push(uri+''&fullmarc=true''); -} else { - var items = utilities.getItemArray(doc, doc, "ipac\.jsp\?.*uri=full=[0-9]|^javascript:buildNewList\\(''.*uri%3Dfull%3D[0-9]"); - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - var buildNewList = new RegExp("^javascript:buildNewList\\(''([^'']+)"); + var uri = doc.location.href; + var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); var uris = new Array(); - for(i in items) { - var m = buildNewList.exec(i); - if(m) { - uris.push(unescape(m[1]+''&fullmarc=true'')); - } else { - uris.push(i+''&fullmarc=true''); + if(detailsRe.test(uri)) { + uris.push(uri+''&fullmarc=true''); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, "ipac\.jsp\?.*uri=full=[0-9]|^javascript:buildNewList\\(''.*uri%3Dfull%3D[0-9]"); + items = Scholar.selectItems(items); + + if(!items) { + return true; } - } -} - -utilities.processDocuments(browser, null, uris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href; - - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]''; - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - var record = new MARC_Record(); - for(var i=0; i<elmts.length; i++) { - var elmt = elmts[i]; - var field = utilities.superCleanString(utilities.getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue); - var value = utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver); - value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1"); - - if(field != "FMT" && field != "LDR") { - var ind1 = ""; - var ind2 = ""; - var valRegexp = /^([0-9])([0-9])? (.*)$/; - var m = valRegexp.exec(value); + + var buildNewList = new RegExp("^javascript:buildNewList\\(''([^'']+)"); + + var uris = new Array(); + for(var i in items) { + var m = buildNewList.exec(i); if(m) { - ind1 = m[1]; - if(ind2) { - ind2 = m[2] - } - value = m[3]; + uris.push(unescape(m[1]+''&fullmarc=true'')); + } else { + uris.push(i+''&fullmarc=true''); } - record.add_field(field, ind1, ind2, value); } } - utilities.importMARCRecord(record, uri, model); -}, function() { done() }, function() {}); - -wait();'); - -REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 3, 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)', -'var node = utilities.getNode(doc, doc, ''//a[text()="marc"]'', null); -if(node) { - return "book"; -} -var node = utilities.getNode(doc, doc, ''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', null); -if(node) { - return "multiple"; -} -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = 
''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -var uri = doc.location.href; -var newUris = new Array(); - -var marcs = utilities.gatherElementsOnXPath(doc, doc, ''//a[text()="marc"]'', nsResolver); - -if(marcs.length == 1) { - newUris.push(marcs[0].href) -} else { - // Require link to match this - var tagRegexp = new RegExp(); - tagRegexp.compile("/chameleon\?.*function=CARDSCR"); - - var items = new Array(); - - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//tr[@class="intrRow"]'', nsResolver); - // Go through table rows - for(var i=0; i<tableRows.length; i++) { - var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); - // Go through links - var url; - for(var j=0; j<links.length; j++) { - if(tagRegexp.test(links[j].href)) { - url = links[j].href; - break; - } - } - if(url) { - // Collect title information - var fields = utilities.gatherElementsOnXPath(doc, tableRows[i], ''./td/table/tbody/tr[th]'', nsResolver); - for(var j=0; j<fields.length; j++) { - var field = utilities.getNode(doc, fields[j], ''./th/text()'', nsResolver); - if(field.nodeValue == "Title") { - var value = utilities.getNodeString(doc, fields[j], ''./td//text()'', nsResolver); - if(value) { - items[url] = utilities.cleanString(value); + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + + Scholar.Utilities.processDocuments(null, uris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''//form/table[@class="tableBackground"]/tbody/tr/td/table[@class="tableBackground"]/tbody/tr[td[1]/a[@class="normalBlackFont1"]]''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + + var record = new marc.MARC_Record(); + for(var i=0; i<elmts.length; i++) { + var elmt = elmts[i]; + var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue); + var value = Scholar.Utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver); + value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1"); + + if(field != "FMT" && field != "LDR") { + var ind1 = ""; + var ind2 = ""; + var valRegexp = /^([0-9])([0-9])? (.*)$/; + var m = valRegexp.exec(value); + if(m) { + ind1 = m[1]; + if(ind2) { + ind2 = m[2] } + value = m[3]; } + marc.add_field(field, ind1, ind2, value); } } - } - - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - for(i in items) { - utilities.debugPrint(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); - newUris.push(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); - } -} - -utilities.processDocuments(browser, null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href - - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var xpath = ''//table[@class="outertable"]/tbody/tr[td[4]]''; - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - var record = new MARC_Record(); - for(var i=0; i<elmts.length; i++) { - var elmt = elmts[i]; - var field = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue; - var ind1 = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver).nodeValue; - var ind2 = utilities.getNode(doc, elmt, ''./TD[3]/text()[1]'', nsResolver).nodeValue; - var value = utilities.getNode(doc, elmt, ''./TD[4]/text()[1]'', nsResolver).nodeValue; - value = value.replace(/\\([a-z]) /g, record.subfield_delimiter+"$1"); - record.add_field(field, ind1, ind2, value); - } + var newItem = new Scholar.Item(); + newItem.source = uri; + record.translate(newItem); + newItem.complete(); + }, function() { Scholar.done() }, function() {}); - utilities.importMARCRecord(record, uri, model); -}, function(){ done(); }, function() {}); - -wait();'); + Scholar.wait(); +}'); -REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 3, 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', -'if(doc.location.href.indexOf("/authority_hits") > 0) { - return "multiple"; -} else { - return "book"; +REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)', +'function detect(doc, url) { + var node = Scholar.Utilities.getNode(doc, doc, ''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', null); + if(node) { + return "multiple"; + } + var node = Scholar.Utilities.getNode(doc, doc, ''//a[text()="marc"]'', null); + if(node) { + return "book"; + } }', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = 
''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var checkItems = false; - -if(doc.location.href.indexOf("/authority_hits") > 0) { +'function doWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - checkItems = utilities.gatherElementsOnXPath(doc, doc, "/html/body//ol/li", nsResolver); -} - -if(checkItems && checkItems.length) { - var items = utilities.getItemArray(doc, checkItems, ''https?://.*/web2/tramp2\.exe/see_record''); - items = utilities.selectItems(items); + var uri = doc.location.href; + var newUris = new Array(); - if(!items) { - return true; - } + var marcs = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//a[text()="marc"]'', nsResolver); - var uris = new Array(); - for(i in items) { - uris.push(i); - } -} else { - var uris = new Array(doc.location.href); -} - -for(i in uris) { - var uri = uris[i]; - var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i; - var m = uriRegexp.exec(uri); - if(uri.indexOf("/authority_hits") < 0) { - var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3]; + if(marcs.length == 1) { + newUris.push(marcs[0].href) } else { - var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc"; + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile("/chameleon\?.*function=CARDSCR"); + + var items = new Array(); + + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//tr[@class="intrRow"]'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); + // Go through links + var url; + for(var j=0; j<links.length; j++) { + 
if(tagRegexp.test(links[j].href)) { + url = links[j].href; + break; + } + } + if(url) { + // Collect title information + var fields = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''./td/table/tbody/tr[th]'', nsResolver); + for(var j=0; j<fields.length; j++) { + var field = Scholar.Utilities.getNode(doc, fields[j], ''./th/text()'', nsResolver); + if(field.nodeValue == "Title") { + var value = Scholar.Utilities.getNodeString(doc, fields[j], ''./td//text()'', nsResolver); + if(value) { + items[url] = Scholar.Utilities.cleanString(value); + } + } + } + } + } + + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + for(var i in items) { + Scholar.Utilities.debugPrint(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); + newUris.push(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); + } } - // Keep track of how many requests have been completed - var j = 0; + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - utilities.HTTPUtilities.doGet(newUri, null, function(text) { - var record = new MARC_Record(); - record.load(text, "binary"); - utilities.importMARCRecord(record, uris[j], model); - j++; - if(j == uris.length) { - done(); + Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''//table[@class="outertable"]/tbody/tr[td[4]]''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + var record = new marc.MARC_Record(); + for(var i=0; i<elmts.length; i++) { + var elmt = elmts[i]; + var field = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue; + var ind1 = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver).nodeValue; + var ind2 = Scholar.Utilities.getNode(doc, elmt, ''./TD[3]/text()[1]'', nsResolver).nodeValue; + var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[4]/text()[1]'', nsResolver).nodeValue; + value = value.replace(/\\([a-z]) /g, record.subfield_delimiter+"$1"); + + record.add_field(field, ind1, ind2, value); } - }); -} -wait();'); - + + var newItem = new Scholar.Item(); + newItem.source = uri; + record.translate(newItem); + newItem.complete(); + }, function(){ Scholar.done(); }, function() {}); + + Scholar.wait(); +}'); -REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 3, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', -'if(doc.location.href.indexOf("/GeacQUERY") > 0) { - return "multiple"; -} else { - return "book"; +REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 4, 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', +'function detect(doc, url) { + if(doc.location.href.indexOf("/authority_hits") > 0) { + return "multiple"; + } else { + return "book"; + } }', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var 
uri = doc.location.href; - -var uris = new Array(); - -if(uri.indexOf("/GeacQUERY") > 0) { - var items = utilities.getItemArray(doc, doc, "(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)"); - items = utilities.selectItems(items); +'function doWeb(doc, url) { + var checkItems = false; - if(!items) { - return true; + if(doc.location.href.indexOf("/authority_hits") > 0) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + checkItems = Scholar.Utilities.gatherElementsOnXPath(doc, doc, "/html/body//ol/li", nsResolver); } - var uris = new Array(); - for(i in items) { - var newUri = i.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); - newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); - uris.push(newUri); + if(checkItems && checkItems.length) { + var items = Scholar.Utilities.getItemArray(doc, checkItems, ''https?://.*/web2/tramp2\.exe/see_record''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(var i in items) { + uris.push(i); + } + } else { + var uris = new Array(doc.location.href); } -} else { - var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); - newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); - uris.push(newUri); -} - -utilities.processDocuments(browser, null, uris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href; - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var record = new MARC_Record(); - - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''//pre/text()'', nsResolver); - var tag, ind1, ind2, content; - - for(var i=0; i<elmts.length; i++) { - var line = elmts[i].nodeValue; - - if(line.substring(0, 6) == " ") { - content += " "+line.substring(6); - continue; + for(var i in uris) { + var uri = uris[i]; + var uriRegexp = /^(https?:\/\/.*\/web2\/tramp2\.exe\/)(?:goto|see\_record|authority\_hits)(\/.*)\?(?:screen=Record\.html\&)?(.*)$/i; + var m = uriRegexp.exec(uri); + if(uri.indexOf("/authority_hits") < 0) { + var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc&"+m[3]; } else { - if(tag) { - record.add_field(tag, ind1, ind2, content); - } + var newUri = m[1]+"download_record"+m[2]+"/RECORD.MRC?format=marc"; } - line = line.replace(/\xA0/g," "); // nbsp - line = line.replace(/_/g," "); - line = line.replace(/\t/g,""); + // Keep track of how many requests have been completed + var j = 0; - tag = line.substring(0, 3); - if(parseInt(tag) > 10) { - ind1 = line.substring(4, 5); - ind2 = line.substring(5, 6); - content = line.substring(7); - content = content.replace(/\$([a-z])(?: |$)/g, record.subfield_delimiter+"$1"); - } else { - ind1 = ""; - ind2 = ""; - content = line.substring(4); - } + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + Scholar.Utilities.HTTPUtilities.doGet(newUri, null, function(text) { + var record = new marc.MARC_Record(); + record.load(text, "binary"); + + var newItem = new Scholar.Item(); + newItem.source = uris[j]; + record.translate(record, newItem); + newItem.complete(); + + j++; + if(j == uris.length) { + Scholar.done(); + } + }); } - - utilities.importMARCRecord(record, uri, model); -}, function() { done(); }, function() {}); - -wait();'); + Scholar.wait(); +}'); -REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', 
'2006-06-26 16:01:00', 3, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', -'var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; -var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver); -for(i in elmts) { - if(utilities.superCleanString(elmts[i].nodeValue) == "Viewing record") { +REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', +'function detect(doc, url) { + if(doc.location.href.indexOf("/GeacQUERY") > 0) { + return "multiple"; + } else { return "book"; } -} -var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(elmts.length) { - return "multiple"; -} -return false;', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -// Cheap hack to convert HTML entities -function unescapeHTML(text) { - var div = doc.createElement("div"); - div.innerHTML = utilities.cleanTags(text); - var text = div.childNodes[0] ? 
div.childNodes[0].nodeValue : null; - delete div; - return text; -} - -var uri = doc.location.href; -var recNumbers = new Array(); - -var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -if(elmts.length) { // Search results page - var uriRegexp = /^http:\/\/[^\/]+/; - var m = uriRegexp.exec(uri); - var postAction = doc.forms.namedItem("hitlist").getAttribute("action"); - var newUri = m[0]+postAction.substr(0, postAction.length-1)+"40" - - var titleRe = /<br>\s*(.*[^\s])\s*<br>/i; +}', +'function doWeb(doc, url) { + var uri = doc.location.href; - var items = new Array(); + var uris = new Array(); - for(var i=0; i<elmts.length; i++) { - var links = utilities.gatherElementsOnXPath(doc, elmts[i], ''.//a'', nsResolver); + if(uri.indexOf("/GeacQUERY") > 0) { + var items = Scholar.Utilities.getItemArray(doc, doc, "(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html)"); + items = Scholar.selectItems(items); - // Collect title - var myTd = utilities.getNode(doc, elmts[i], "./td[2]", nsResolver); - var m = titleRe.exec(myTd.innerHTML); - var title = unescapeHTML(m[1]); + if(!items) { + return true; + } - items[i] = title; - } - - - items = utilities.selectItems(items); - - if(!items) { - return true; - } - - for(i in items) { - recNumbers.push(parseInt(i)+1); - } -} else { // Normal page - var uriRegexp = /^(.*)(\/[0-9]+)$/; - var m = uriRegexp.exec(uri); - var newUri = m[1]+"/40" - - var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver); - for(i in elmts) { - var elmt = elmts[i]; - var initialText = utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver); - if(initialText && initialText.nodeValue && utilities.superCleanString(initialText.nodeValue) == "Viewing record") { - recNumbers.push(utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue); - break; + var uris = new Array(); + for(var i in items) { + var newUri = 
i.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); + newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); + uris.push(newUri); } + } else { + var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html"); + newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html"); + uris.push(newUri); } -} - -utilities.HTTPUtilities.doGet(newUri+''?marks=''+recNumbers.join(",")+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', null, function(text) { - var texts = text.split("<PRE>"); - texts = texts[1].split("</PRE>"); - text = unescapeHTML(texts[0]); - var documents = text.split("*** DOCUMENT BOUNDARY ***"); - for(var j=1; j<documents.length; j++) { - var uri = newUri+"?marks="+recNumbers[j]+"&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type="; - var lines = documents[j].split("\n"); - var record = new MARC_Record(); + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); + + Scholar.Utilities.processDocuments(null, uris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var record = new marc.MARC_Record(); + + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, ''//pre/text()'', nsResolver); var tag, ind1, ind2, content; - for(var i=0; i<lines.length; i++) { - var line = lines[i]; + + for(var i=0; i<elmts.length; i++) { + var line = elmts[i].nodeValue; - if(line.substr(0, 1) == "." && line.substr(4,2) == ". 
") { + if(line.substring(0, 6) == " ") { + content += " "+line.substring(6); + continue; + } else { if(tag) { - content = content.replace(/\|([a-z])/g, record.subfield_delimiter+"$1"); record.add_field(tag, ind1, ind2, content); } - } else { - content += " "+line.substring(6); - continue; } - tag = line.substr(1, 3); + line = line.replace(/\xA0/g," "); // nbsp + line = line.replace(/_/g," "); + line = line.replace(/\t/g,""); + tag = line.substring(0, 3); if(parseInt(tag) > 10) { - ind1 = line.substr(6, 1); - ind2 = line.substr(7, 1); - content = line.substr(8); + ind1 = line.substring(4, 5); + ind2 = line.substring(5, 6); + content = line.substring(7); + content = content.replace(/\$([a-z])(?: |$)/g, record.subfield_delimiter+"$1"); } else { ind1 = ""; ind2 = ""; - content = line.substring(6); + content = line.substring(4); } + } - utilities.importMARCRecord(record, uri, model); - } - done(); -}); - -wait();'); + + var newItem = new Scholar.Item(); + newItem.source = uri; + record.translate(newItem); + newItem.complete(); + }, function() { Scholar.done(); }, function() {}); + + Scholar.wait(); +}'); -REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 3, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', -'var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); -if(detailRe.test(doc.location.href)) { - return "book"; -} else { - return "multiple"; -}', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); -var uri = doc.location.href; -var newUris = new Array(); - -if(detailRe.test(uri)) { - newUris.push(uri.replace("LabelDisplay", "MARCDisplay")); -} else { - var items = utilities.getItemArray(doc, doc, ''TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]''); - items = utilities.selectItems(items); +REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +'function detect(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; - if(!items) { - return true; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver); + for(var i=0; i<elmts.length; i++) { + if(Scholar.Utilities.superCleanString(elmts[i].nodeValue) == "Viewing record") { + return "book"; + } } - - for(i in items) { - newUris.push(i.replace("LabelDisplay", "MARCDisplay")); + var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + if(elmts.length) { + return "multiple"; } -} - -utilities.processDocuments(browser, null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href; - - var namespace = newDoc.documentElement.namespaceURI; +}', +'function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; + if (prefix == ''x'') return namespace; else return null; } : null; - var record = new MARC_Record(); + // Cheap hack to convert HTML entities + function unescapeHTML(text) { + var div = doc.createElement("div"); + div.innerHTML = Scholar.Utilities.cleanTags(text); + var text = div.childNodes[0] ? div.childNodes[0].nodeValue : null; + delete div; + return text; + } - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, ''/html/body/table/tbody/tr[td[4]]'', nsResolver); - var tag, ind1, ind2, content; + var uri = doc.location.href; + var recNumbers = new Array(); - for(var i=0; i<elmts.length; i++) { - var elmt = elmts[i]; + var xpath = ''//form[@name="hitlist"]/table/tbody/tr''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + if(elmts.length) { // Search results page + var uriRegexp = /^http:\/\/[^\/]+/; + var m = uriRegexp.exec(uri); + var postAction = doc.forms.namedItem("hitlist").getAttribute("action"); + var newUri = m[0]+postAction.substr(0, postAction.length-1)+"40" - tag = utilities.getNode(newDoc, elmt, ''./td[2]/tt[1]/text()[1]'', nsResolver).nodeValue; - var inds = utilities.getNode(newDoc, elmt, ''./td[3]/tt[1]/text()[1]'', nsResolver).nodeValue; + var titleRe = /<br>\s*(.*[^\s])\s*<br>/i; - tag = tag.replace(/[\r\n]/g, ""); - if(tag.length == 1) { - tag = "00"+tag; - } else if(tag.length == 2) { - tag = "0"+tag; + var items = new Array(); + + for(var i=0; i<elmts.length; i++) { + var links = Scholar.Utilities.gatherElementsOnXPath(doc, elmts[i], ''.//a'', nsResolver); + + // Collect title + var myTd = Scholar.Utilities.getNode(doc, elmts[i], "./td[2]", nsResolver); + var m = titleRe.exec(myTd.innerHTML); + var title = unescapeHTML(m[1]); + + items[i] = title; } - inds = inds.replace(/[\r\n]/g, ""); - // Get indicators, fix possible problems with &nbsp;s - ind1 = inds.substr(0, 1); - ind2 = inds.substr(1, 1); - if(ind1 == "\xA0") { 
- ind1 = ""; + + items = Scholar.selectItems(items); + + if(!items) { + return true; } - if(ind2 == "\xA0") { - ind2 = ""; + + for(var i in items) { + recNumbers.push(parseInt(i)+1); } + } else { // Normal page + var uriRegexp = /^(.*)(\/[0-9]+)$/; + var m = uriRegexp.exec(uri); + var newUri = m[1]+"/40" - var children = utilities.gatherElementsOnXPath(newDoc, elmt, ''./td[4]/tt[1]//text()'', nsResolver); - content = ""; - if(children.length == 1) { - content = children[0].nodeValue; - } else { - for(var j=0; j<children.length; j+=2) { - var subfield = children[j].nodeValue.substr(1, 1); - var fieldContent = children[j+1].nodeValue; - content += record.subfield_delimiter+subfield+fieldContent; - } - } - - record.add_field(tag, ind1, ind2, content); - } - - utilities.importMARCRecord(record, uri, model); -}, function() {done(); }, function() {}); - -wait();'); - -REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 3, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', -'var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); -if(searchRe.test(doc.location.href)) { - return "multiple"; -} else { - return "journalArticle"; -}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var namespace = doc.documentElement.namespaceURI; -var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; -} : null; - -function newDataObject() { - var data = new Object(); - data[prefixDC + "title"] = new Array(); - data[prefixDC + "creator"] = new Array(); - data[prefixDummy + "publication"] = new Array(); - data[prefixDummy + "volume"] = new Array(); - data[prefixDummy + "number"] = new Array(); - data[prefixDummy + "series"] = new Array(); - data[prefixDC + "year"] = new Array(); - data[prefixDummy + "pages"] = new Array(); - data[prefixDC + "identifier"] = new Array(); - data[prefixDC + "publisher"] = new Array(); - return data; -} - -var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); -if(searchRe.test(doc.location.href)) { - var items = new Array(); - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table[@class="navbar"]/tbody/tr/td/form/table'', nsResolver); - // Go through table rows - for(var i=0; i<tableRows.length; i++) { - // article_id is what we need to get it all as one file - var input = utilities.getNode(doc, tableRows[i], ''./tbody/tr/td/input[@name="article_id"]'', nsResolver); - var link = utilities.getNode(doc, tableRows[i], ''.//b/i/a/text()'', nsResolver); - if(input && input.value && link && link.nodeValue) { - items[input.value] = link.nodeValue; + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver); + for(var i=0; i<elmts.length; i++) { + var elmt = elmts[i]; + var initialText = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver); + if(initialText && initialText.nodeValue && Scholar.Utilities.superCleanString(initialText.nodeValue) == "Viewing record") { + recNumbers.push(Scholar.Utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue); + break; + } } } - items = utilities.selectItems(items); - if(!items) { - return true; - } + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - try { - var search_id = 
doc.forms.namedItem("results").elements.namedItem("search_id").value; - } catch(e) { - var search_id = ""; - } - var articleString = ""; - for(i in items) { - articleString += "&article_id="+i; - } - var savePostString = "actiontype=save&search_id="+search_id+articleString; - - utilities.HTTPUtilities.doGet("http://muse.jhu.edu/search/save.cgi?"+savePostString, null, function() { - utilities.HTTPUtilities.doGet("http://muse.jhu.edu/search/export.cgi?exporttype=endnote"+articleString, null, function(text) { - var records = text.split("\n\n"); - for(i in records) { - var lines = records[i].split("\n"); - if(lines.length > 1) { - var data = newDataObject(); - for(i in lines) { - var fieldCode = lines[i].substring(0, 2); - var fieldContent = utilities.cleanString(lines[i].substring(6)) - - if(fieldCode == "T1") { - data[prefixDC + "title"].push(fieldContent); - } else if(fieldCode == "A1") { - var authors = fieldContent.split(";"); - for(j in authors) { - var author = authors[j]; - if(author) { - var splitNames = author.split('', ''); - if(splitNames) { - author = splitNames[1]+'' ''+splitNames[0]; - } - data[prefixDC + "creator"].push(author); - } - } - } else if(fieldCode == "JF") { - data[prefixDummy + "publication"].push(fieldContent); - } else if(fieldCode == "VL") { - data[prefixDummy + "volume"].push(fieldContent); - } else if(fieldCode == "IS") { - data[prefixDummy + "number"].push(fieldContent); - } else if(fieldCode == "Y1") { - data[prefixDC + "year"].push(fieldContent); - } else if(fieldCode == "PP") { - data[prefixDummy + "pages"].push(fieldContent); - } else if(fieldCode == "UR") { - stableURL = fieldContent; - } else if(fieldCode == "SN") { - data[prefixDC + "identifier"].push("ISSN "+fieldContent); - ISSN = fieldContent; - } else if(fieldCode == "PB") { - data[prefixDC + "publisher"].push(fieldContent); - } - } - model.addStatement(stableURL, prefixRDF + "type", prefixDummy + "journalArticle", false); - for(i in data) { - if(data[i].length) { - for(j in 
data[i]) { - model.addStatement(stableURL, i, data[i][j]); - } - } + Scholar.Utilities.HTTPUtilities.doGet(newUri+''?marks=''+recNumbers.join(",")+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', null, function(text) { + var texts = text.split("<PRE>"); + texts = texts[1].split("</PRE>"); + text = unescapeHTML(texts[0]); + var documents = text.split("*** DOCUMENT BOUNDARY ***"); + + for(var j=1; j<documents.length; j++) { + var uri = newUri+"?marks="+recNumbers[j]+"&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type="; + var lines = documents[j].split("\n"); + var record = new marc.MARC_Record(); + var tag, ind1, ind2, content; + for(var i=0; i<lines.length; i++) { + var line = lines[i]; + + if(line.substr(0, 1) == "." && line.substr(4,2) == ". ") { + if(tag) { + content = content.replace(/\|([a-z])/g, record.subfield_delimiter+"$1"); + record.add_field(tag, ind1, ind2, content); } + } else { + content += " "+line.substring(6); + continue; + } + + tag = line.substr(1, 3); + + if(parseInt(tag) > 10) { + ind1 = line.substr(6, 1); + ind2 = line.substr(7, 1); + content = line.substr(8); + } else { + ind1 = ""; + ind2 = ""; + content = line.substring(6); } } - done(); - }, function() {}); - }, function() {}); - - wait(); -} else { - var uri = doc.location.href; - - var elmts = utilities.gatherElementsOnXPath(doc, doc, ''//comment()'', nsResolver); - for(i in elmts) { - if(elmts[i].nodeValue.substr(0, 10) == "HeaderData") { - var headerRegexp = /HeaderData((?:.|\n)*)\#\#EndHeaders/i - var m = headerRegexp.exec(elmts[i].nodeValue); - var headerData = m[1]; + + var newItem = new Scholar.Item(); + newItem.source = uri; + record.translate(newItem); + newItem.complete(); } + Scholar.done(); + }); + + Scholar.wait(); +}'); + +REPLACE INTO 
"translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', +'function detect(doc, url) { + var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); + if(detailRe.test(doc.location.href)) { + return "book"; + } else { + return "multiple"; } +}', +'function doWeb(doc, url) { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; - // Use E4X rather than DOM/XPath, because the Mozilla gods have decided not to - // expose DOM/XPath to sandboxed scripts - var newDOM = new XML(headerData); + var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); + var uri = doc.location.href; + var newUris = new Array(); - function mapRDF(text, rdfUri) { - if(text) { - model.addStatement(uri, rdfUri, text, true); + if(detailRe.test(uri)) { + newUris.push(uri.replace("LabelDisplay", "MARCDisplay")); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, ''TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]''); + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + for(var i in items) { + newUris.push(i.replace("LabelDisplay", "MARCDisplay")); } } - mapRDF(newDOM.journal.text(), prefixDummy + "publication"); - mapRDF(newDOM.volume.text(), prefixDummy + "volume"); - mapRDF(newDOM.issue.text(), prefixDummy + "number"); - mapRDF(newDOM.year.text(), prefixDummy + "year"); - mapRDF(newDOM.pubdate.text(), prefixDC + "date"); - mapRDF(newDOM.doctitle.text(), prefixDC + "title"); + var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - // Do ISSN - var issn = newDOM.issn.text(); - if(issn) { - model.addStatement(uri, prefixDC + "identifier", "ISSN "+issn.replace(/[^0-9]/g, ""), 
true); - } - - // Do pages - var fpage = newDOM.fpage.text(); - var lpage = newDOM.lpage.text(); - if(fpage != "") { - var pages = fpage; - if(lpage) { - pages += "-"+lpage; + Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var record = new marc.MARC_Record(); + + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, ''/html/body/table/tbody/tr[td[4]]'', nsResolver); + var tag, ind1, ind2, content; + + for(var i=0; i<elmts.length; i++) { + var elmt = elmts[i]; + + tag = Scholar.Utilities.getNode(newDoc, elmt, ''./td[2]/tt[1]/text()[1]'', nsResolver).nodeValue; + var inds = Scholar.Utilities.getNode(newDoc, elmt, ''./td[3]/tt[1]/text()[1]'', nsResolver).nodeValue; + + tag = tag.replace(/[\r\n]/g, ""); + if(tag.length == 1) { + tag = "00"+tag; + } else if(tag.length == 2) { + tag = "0"+tag; + } + inds = inds.replace(/[\r\n]/g, ""); + + // Get indicators, fix possible problems with &nbsp;s + ind1 = inds.substr(0, 1); + ind2 = inds.substr(1, 1); + if(ind1 == "\xA0") { + ind1 = ""; + } + if(ind2 == "\xA0") { + ind2 = ""; + } + + var children = Scholar.Utilities.gatherElementsOnXPath(newDoc, elmt, ''./td[4]/tt[1]//text()'', nsResolver); + content = ""; + if(children.length == 1) { + content = children[0].nodeValue; + } else { + for(var j=0; j<children.length; j+=2) { + var subfield = children[j].nodeValue.substr(1, 1); + var fieldContent = children[j+1].nodeValue; + content += record.subfield_delimiter+subfield+fieldContent; + } + } + + record.add_field(tag, ind1, ind2, content); } - model.addStatement(uri, prefixDummy + "pages", pages, true); - } - - // Do authors - var elmts = newDOM.docauthor; - for(i in elmts) { - var fname = elmts[i].fname.text(); - var surname = 
elmts[i].surname.text(); - model.addStatement(uri, prefixDC + "creator", fname+" "+surname, true); - } + + var newItem = new Scholar.Item(); + newItem.source = uri; + record.translate(newItem); + newItem.complete(); + }, function() {Scholar.done(); }, function() {}); - model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); + Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 3, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', -'if(doc.location.href.indexOf("list_uids=") >= 0) { - return "journalArticle"; -} else { - return "multiple"; -}', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -function mapRDF(uri, text, rdfUri) { - if(text != "") { - model.addStatement(uri, rdfUri, text, true); +REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 4, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', +'function detect(doc, url) { + var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); + if(searchRe.test(url)) { + return "multiple"; + } else { + return "journalArticle"; } -} - -var uri = doc.location.href; -var ids = new Array(); -var idRegexp = /[\?\&]list_uids=([0-9\,]+)/; - -var m = idRegexp.exec(uri); -if(m) { - ids.push(m[1]); -} else { +}', +'function doWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? 
function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; - var items = new Array(); - var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver); - // Go through table rows - for(var i=0; i<tableRows.length; i++) { - var link = utilities.getNode(doc, tableRows[i], ''.//a'', nsResolver); - var article = utilities.getNode(doc, tableRows[i], ''./tr[2]/td[2]/text()[1]'', nsResolver); - items[link.href] = article.nodeValue; + var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); + if(searchRe.test(doc.location.href)) { + var items = new Array(); + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table[@class="navbar"]/tbody/tr/td/form/table'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + // article_id is what we need to get it all as one file + var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./tbody/tr/td/input[@name="article_id"]'', nsResolver); + var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//b/i/a/text()'', nsResolver); + if(input && input.value && link && link.nodeValue) { + items[input.value] = link.nodeValue; + } + } + + items = Scholar.selectItems(items); + if(!items) { + return true; + } + + try { + var search_id = doc.forms.namedItem("results").elements.namedItem("search_id").value; + } catch(e) { + var search_id = ""; + } + var articleString = ""; + for(var i in items) { + articleString += "&article_id="+i; + } + var savePostString = "actiontype=save&search_id="+search_id+articleString; + + Scholar.Utilities.HTTPUtilities.doGet("http://muse.jhu.edu/search/save.cgi?"+savePostString, null, function() { + Scholar.Utilities.HTTPUtilities.doGet("http://muse.jhu.edu/search/export.cgi?exporttype=endnote"+articleString, null, function(text) { + // load translator for RIS + var translator = Scholar.loadTranslator("import", "32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7"); + // feed in data + 
translator.Scholar.write(text); + translator.Scholar.eof(); + // translate + translator.doImport(); + Scholar.done(); + }, function() {}); + }, function() {}); + + Scholar.wait(); + } else { + var newItem = new Scholar.Item("journalArticle"); + newItem.source = url; + + var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//comment()'', nsResolver); + for(var i in elmts) { + if(elmts[i].nodeValue.substr(0, 10) == "HeaderData") { + var headerRegexp = /HeaderData((?:.|\n)*)\#\#EndHeaders/i + var m = headerRegexp.exec(elmts[i].nodeValue); + var headerData = m[1]; + } + } + + // Use E4X rather than DOM/XPath, because the Mozilla gods have decided not to + // expose DOM/XPath to sandboxed scripts + var newDOM = new XML(headerData); + + function mapRDF(text, rdfUri) { + if(text) { + model.addStatement(uri, rdfUri, text, true); + } + } + + newItem.publication = newDOM.journal.text(); + newItem.volume = newDOM.volume.text(); + newItem.number = newDOM.issue.text(); + newItem.year = newDOM.year.text(); + newItem.date = newDOM.pubdate.text(); + newItem.title = newDOM.doctitle.text(); + newItem.ISSN = newDOM.issn.text(); + + // Do pages + var fpage = newDOM.fpage.text(); + var lpage = newDOM.lpage.text(); + if(fpage != "") { + newItem.pages = fpage; + if(lpage) { + newItem.pages += "-"+lpage; + } + } + + // Do authors + var elmts = newDOM.docauthor; + for(var i in elmts) { + var fname = elmts[i].fname.text(); + var surname = elmts[i].surname.text(); + newItem.creators.push({firstName:fname, lastName:surname, creatorType:"author"}); + } + + newItem.complete(); } - - items = utilities.selectItems(items); - - if(!items) { - return true; +}'); + +REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-26 16:01:00', 4, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', +'function detect(doc, 
url) { + if(doc.location.href.indexOf("list_uids=") >= 0) { + return "journalArticle"; + } else { + return "multiple"; } +}', +'function doWeb(doc, url) { + var uri = doc.location.href; + var ids = new Array(); + var idRegexp = /[\?\&]list_uids=([0-9\,]+)/; - for(i in items) { - var m = idRegexp.exec(i); + var m = idRegexp.exec(uri); + if(m) { ids.push(m[1]); - } -} - -var newUri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=PubMed&retmode=xml&rettype=citation&id="+ids.join(","); -utilities.HTTPUtilities.doGet(newUri, null, function(text) { - // Remove xml parse instruction and doctype - text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, ""); - - var xml = new XML(text); - - for(var i=0; i<xml.PubmedArticle.length(); i++) { - var citation = xml.PubmedArticle[i].MedlineCitation; + } else { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; - var uri = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=pubmed&cmd=Retrieve&list_uids="+citation.PMID.text(); - if(citation.PMID.length()) { - model.addStatement(uri, prefixDC + "identifier", "PMID "+citation.PMID.text(), true); + var items = new Array(); + var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//a'', nsResolver); + var article = Scholar.Utilities.getNode(doc, tableRows[i], ''./tr[2]/td[2]/text()[1]'', nsResolver); + items[link.href] = article.nodeValue; } - var article = citation.Article; - if(article.ArticleTitle.length()) { - var title = article.ArticleTitle.text().toString(); - if(title.substr(-1) == ".") { - title = title.substring(0, title.length-1); - } - model.addStatement(uri, prefixDC + "title", title, true); + items = Scholar.selectItems(items); + + 
if(!items) { + return true; } - if(article.Journal.length()) { - var issn = article.Journal.ISSN.text(); - if(issn) { - model.addStatement(uri, prefixDC + "identifier", "ISSN "+issn.replace(/[^0-9]/g, ""), true); - } + for(var i in items) { + var m = idRegexp.exec(i); + ids.push(m[1]); + } + } + + var newUri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=PubMed&retmode=xml&rettype=citation&id="+ids.join(","); + Scholar.Utilities.HTTPUtilities.doGet(newUri, null, function(text) { + // Remove xml parse instruction and doctype + text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, ""); + + var xml = new XML(text); + + for(var i=0; i<xml.PubmedArticle.length(); i++) { + var newItem = new Scholar.Item("journalArticle"); + + var citation = xml.PubmedArticle[i].MedlineCitation; + + newItem.source = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=pubmed&cmd=Retrieve&list_uids="+citation.PMID.text(); + // TODO: store PMID directly - if(article.Journal.Title.length()) { - model.addStatement(uri, prefixDummy + "publication", utilities.superCleanString(article.Journal.Title.text().toString()), true); - } else if(citation.MedlineJournalInfo.MedlineTA.length()) { - model.addStatement(uri, prefixDummy + "publication", utilities.superCleanString(citation.MedlineJournalInfo.MedlineTA.text().toString()), true); + var article = citation.Article; + if(article.ArticleTitle.length()) { + var title = article.ArticleTitle.text().toString(); + if(title.substr(-1) == ".") { + title = title.substring(0, title.length-1); + } + newItem.title = title; } - if(article.Journal.JournalIssue.length()) { - mapRDF(uri, article.Journal.JournalIssue.Volume.text(), prefixDummy + "volume"); - mapRDF(uri, article.Journal.JournalIssue.Issue.text(), prefixDummy + "number"); - if(article.Journal.JournalIssue.PubDate.length()) { - if(article.Journal.JournalIssue.PubDate.Day.text().toString() != "") { - var date = article.Journal.JournalIssue.PubDate.Month.text()+" 
"+article.Journal.JournalIssue.PubDate.Day.text()+", "+article.Journal.JournalIssue.PubDate.Year.text(); - var jsDate = new Date(date); - if(!isNaN(jsDate.valueOf())) { - date = utilities.dateToISO(date); + if(article.Journal.length()) { + var issn = article.Journal.ISSN.text(); + if(issn) { + newItem.ISSN = issn.replace(/[^0-9]/g, ""); + } + + if(article.Journal.Title.length()) { + newItem.publication = Scholar.Utilities.superCleanString(article.Journal.Title.text().toString()); + } else if(citation.MedlineJournalInfo.MedlineTA.length()) { + newItem.publication = Scholar.Utilities.superCleanString(citation.MedlineJournalInfo.MedlineTA.text().toString()); + } + + if(article.Journal.JournalIssue.length()) { + newItem.volume = article.Journal.JournalIssue.Volume.text(); + newItem.number = article.Journal.JournalIssue.Issue.text(); + if(article.Journal.JournalIssue.PubDate.length()) { // try to get the date + if(article.Journal.JournalIssue.PubDate.Day.text().toString() != "") { + var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Day.text()+", "+article.Journal.JournalIssue.PubDate.Year.text(); + var jsDate = new Date(date); + if(!isNaN(jsDate.valueOf())) { + date = Scholar.Utilities.dateToISO(jsDate); + } + } else if(article.Journal.JournalIssue.PubDate.Month.text().toString() != "") { + var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text(); + } else if(article.Journal.JournalIssue.PubDate.Year.text().toString() != "") { + var date = article.Journal.JournalIssue.PubDate.Year.text(); + } + + if(date) { + newItem.date = date; } - } else if(article.Journal.JournalIssue.PubDate.Month.text().toString() != "") { - var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text(); - } else if(article.Journal.JournalIssue.PubDate.Year.text().toString() != "") { - var date = 
article.Journal.JournalIssue.PubDate.Year.text(); - } - if(date) { - model.addStatement(uri, prefixDC + "date", date, true); } } } - } - - if(article.AuthorList.length() && article.AuthorList.Author.length()) { - var authors = article.AuthorList.Author; - for(var j=0; j<authors.length(); j++) { - var lastName = authors[j].LastName.text().toString(); - var firstName = authors[j].FirstName.text().toString(); - if(firstName == "") { - var firstName = authors[j].ForeName.text().toString(); - } - if(firstName && lastName) { - model.addStatement(uri, prefixDC + "creator", firstName + " " + lastName); + + if(article.AuthorList.length() && article.AuthorList.Author.length()) { + var authors = article.AuthorList.Author; + for(var j=0; j<authors.length(); j++) { + var lastName = authors[j].LastName.text().toString(); + var firstName = authors[j].FirstName.text().toString(); + if(firstName == "") { + var firstName = authors[j].ForeName.text().toString(); + } + if(firstName || lastName) { + newItem.creators.push({lastName:lastName, firstName:firstName}); + } } } + + newItem.complete(); } - model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); - } - - done(); -}) - -wait();'); - -REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 3, 'Generic Scraper', 'Simon Kornblith', NULL, -'return "website";', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var uri = doc.location.href; - -// Eventually, we can grab a last modified date from the Last-Modified header, -// but Piggy Bank will never be able to manage that - -var metaTags = doc.getElementsByTagName("meta"); + + Scholar.done(); + }) + + Scholar.wait(); +}'); -var foundCreator = false; // Multiple creators encoded two different ways can screw us up -var foundTitle = 
false; // Can always figure this out on our own -for(var i=0; i<metaTags.length; i++) { - var tag = metaTags[i].getAttribute("name"); - var value = metaTags[i].getAttribute("content"); - if(tag && value && tag.substr(0, 3).toLowerCase() == "dc.") { - var suffix = tag.substr(3); - if(suffix == "creator" && !foundCreator) { - // Everyone uses different methods of encoding the DC creator; clean them - value = utilities.cleanAuthor(value); - var foundCreator = true; - } - if(suffix == "title") { - foundTitle = true; +REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 4, 'Embedded RDF Scraper', 'Simon Kornblith', NULL, +'function detect(doc, url) { + var metaTags = doc.getElementsByTagName("meta"); + + for(var i=0; i<metaTags.length; i++) { + var tag = metaTags[i].getAttribute("name"); + if(tag && tag.substr(0, 3).toLowerCase() == "dc.") { + return "website"; } - model.addStatement(uri, prefixDC + suffix, value, true); - } else if(tag && value && (tag == "author" || tag == "author-personal")) { - value = utilities.cleanAuthor(value); - var foundCreator = true; - model.addStatement(uri, prefixDC + "creator", value, true); - } else if(tag && value && tag == "author-corporate") { - var foundCreator = true; - model.addStatement(uri, prefixDC + "creator", value, true); - } else if(tag && value && tag == "title") { - var foundTitle = true; - model.addStatement(uri, prefixDC + "title", value, true); } -} - -if(!foundTitle) { - model.addStatement(uri, prefixDC + "title", doc.title, true); -} - -model.addStatement(uri, prefixRDF + "type", prefixDummy + "website", false);'); - -REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 3, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', -'var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i''); -if(re.test(doc.location.href)) { - return "book"; -} 
else { - return "multiple"; + + return false; }', -'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; -var prefixDC = ''http://purl.org/dc/elements/1.1/''; -var prefixDCMI = ''http://purl.org/dc/dcmitype/''; -var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; - -var uri = doc.location.href; -var newUris = new Array(); - -var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i''); -var m = re.exec(uri); -if(m) { - newUris.push(''http://books.google.com/books?vid=''+m[1]+''&id=''+m[2]); -} else { - var items = utilities.getItemArray(doc, doc, ''http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''^(?:All matching pages|About this Book)''); +'function doWeb(doc, url) { + var dc = "http://purl.org/dc/elements/1.1/"; - // Drop " - Page" thing - for(i in items) { - items[i] = items[i].replace(/- Page [0-9]+\s*$/, ""); + // load RDF translator + var translator = Scholar.loadTranslator("import", "5e3ad958-ac79-463d-812b-a86a9235c28f"); + + var metaTags = doc.getElementsByTagName("meta"); + var foundTitle = false; // We can use the page title if necessary + for(var i=0; i<metaTags.length; i++) { + var tag = metaTags[i].getAttribute("name"); + var value = metaTags[i].getAttribute("content"); + if(tag && value && tag.substr(0, 3).toLowerCase() == "dc.") { + if(tag == "dc.title") { + foundTitle = true; + } + translator.Scholar.RDF.addStatement(url, dc + tag.substr(3), value, true); + Scholar.Utilities.debugPrint(tag.substr(3) + " = " + value); + } else if(tag && value && (tag == "author" || tag == "author-personal")) { + translator.Scholar.RDF.addStatement(url, dc + "creator", value, true); + } else if(tag && value && tag == "author-corporate") { + translator.Scholar.RDF.addStatement(url, dc + "creator", value, true); + } } - items = utilities.selectItems(items); - if(!items) { - return true; + if(!foundTitle) { + translator.Scholar.RDF.addStatement(url, dc + "title", doc.title, true); } - for(i in 
items) { - var m = re.exec(i); - newUris.push(''http://books.google.com/books?vid=''+m[1]+''&id=''+m[2]); + translator.doImport(); +}'); + +REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', +'function detect(doc, url) { + var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i''); + if(re.test(doc.location.href)) { + return "book"; + } else { + return "multiple"; + } +}', +'function doWeb(doc, url) { + var uri = doc.location.href; + var newUris = new Array(); + + var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i''); + var m = re.exec(uri); + if(m) { + newUris.push(''http://books.google.com/books?vid=''+m[1]+''&id=''+m[2]); + } else { + var items = Scholar.Utilities.getItemArray(doc, doc, ''http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''^(?:All matching pages|About this Book|Table of Contents|Index)''); + + // Drop " - Page" thing + for(var i in items) { + items[i] = items[i].replace(/- Page [0-9]+\s*$/, ""); + } + items = Scholar.selectItems(items); + + if(!items) { + return true; + } + + for(var i in items) { + var m = re.exec(i); + newUris.push(''http://books.google.com/books?vid=''+m[1]+''&id=''+m[2]); + } } -} - -utilities.processDocuments(browser, null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; - var uri = newDoc.location.href; - var namespace = newDoc.documentElement.namespaceURI; - var nsResolver = namespace ? 
function(prefix) { - if (prefix == ''x'') return namespace; else return null; - } : null; - - var xpath = ''//table[@id="bib"]/tbody/tr''; - var elmts = utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - for(var i = 0; i<elmts.length; i++) { - var field = utilities.getNode(newDoc, elmts[i], ''./td[1]//text()'', nsResolver); - var value = utilities.getNode(newDoc, elmts[i], ''./td[2]//text()'', nsResolver); - - if(field && value) { - field = utilities.superCleanString(field.nodeValue); - value = utilities.cleanString(value.nodeValue); - if(field == "Title") { - model.addStatement(uri, prefixDC + ''title'', value); - } else if(field == "Author(s)") { - var authors = value.split(", "); - for(j in authors) { - model.addStatement(uri, prefixDC + ''creator'', authors[j]); - } - } else if(field == "Editor(s)") { - var authors = value.split(", "); - for(j in authors) { - model.addStatement(uri, prefixDummy + ''editor'', authors[j]); - } - } else if(field == "Publisher") { - model.addStatement(uri, prefixDC + ''publisher'', value); - } else if(field == "Publication Date") { - var date = value; - - jsDate = new Date(value); - if(!isNaN(jsDate.valueOf())) { - date = utilities.dateToISO(jsDate); + Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var newItem = new Scholar.Item("book"); + newItem.source = newDoc.location.href; + + var namespace = newDoc.documentElement.namespaceURI; + var nsResolver = namespace ? 
function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var xpath = ''//table[@id="bib"]/tbody/tr''; + var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); + for(var i = 0; i<elmts.length; i++) { + var field = Scholar.Utilities.getNode(newDoc, elmts[i], ''./td[1]//text()'', nsResolver); + var value = Scholar.Utilities.getNode(newDoc, elmts[i], ''./td[2]//text()'', nsResolver); + + if(field && value) { + field = Scholar.Utilities.superCleanString(field.nodeValue); + value = Scholar.Utilities.cleanString(value.nodeValue); + if(field == "Title") { + newItem.title = value; + } else if(field == "Author(s)") { + var authors = value.split(", "); + for(j in authors) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author")); + } + } else if(field == "Editor(s)") { + var authors = value.split(", "); + for(j in authors) { + newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "editor")); + } + } else if(field == "Publisher") { + newItem.publisher = value; + } else if(field == "Publication Date") { + var date = value; + + jsDate = new Date(value); + if(!isNaN(jsDate.valueOf())) { + date = Scholar.Utilities.dateToISO(jsDate); + } + + newItem.date = date; + /*} else if(field == "Format") { + .addStatement(uri, prefixDC + ''medium'', value);*/ + } else if(field == "ISBN") { + newItem.ISBN = value; } - - model.addStatement(uri, prefixDC + ''date'', date); - } else if(field == "Format") { - model.addStatement(uri, prefixDC + ''medium'', value); - } else if(field == "ISBN") { - model.addStatement(uri, prefixDC + ''identifier'', ''ISBN ''+value); } } - } - model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); -}, function() { done(); }, function() {}); - -wait();'); + newItem.complete(); + }, function() { Scholar.done(); }, function() {}); + + Scholar.wait(); +}'); REPLACE INTO "translators" VALUES ('0e2235e7-babf-413c-9acf-f27cce5f059c', '2006-07-05 23:40:00', 
2, 'MODS (XML)', 'Simon Kornblith', 'xml', -'addOption("exportNotes", true); -addOption("exportFileData", true);', +'Scholar.addOption("exportNotes", true); +Scholar.addOption("exportFileData", true);', 'var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; var rdf = new Namespace("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#"); var rdfs = new Namespace("rdfs", "http://www.w3.org/2000/01/rdf-schema#"); @@ -2513,15 +2473,14 @@ function generateSeeAlso(id, seeAlso, rdfDoc) { rdfDoc.rdf::description += description; } -function translate(items, collections) { +function doExport() { //var rdfDoc = <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" />; var modsCollection = <modsCollection xmlns="http://www.loc.gov/mods/v3" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.loc.gov/mods/v3 http://www.loc.gov/standards/mods/v3/mods-3-2.xsd" />; - for(var i in items) { - var item = items[i]; - + var item; + while(item = Scholar.nextItem()) { var isPartialItem = false; - if(utilities.inArray(item.itemType, partialItemTypes)) { + if(Scholar.Utilities.inArray(item.itemType, partialItemTypes)) { isPartialItem = true; } @@ -2607,7 +2566,7 @@ function translate(items, collections) { } // XML tag recordInfo.recordOrigin; used to store our generator note - //mods.recordInfo.recordOrigin = "Scholar for Firefox "+utilities.getVersion(); + //mods.recordInfo.recordOrigin = "Scholar for Firefox "+Scholar.Utilities.getVersion(); /** FIELDS ON NEARLY EVERYTHING BUT NOT A PART OF THE CORE **/ @@ -2646,7 +2605,7 @@ function translate(items, collections) { // XML tag detail; object field volume if(item.volume) { - if(utilities.isInt(item.volume)) { + if(Scholar.Utilities.isInt(item.volume)) { part += <detail type="volume"><number>{item.volume}</number></detail>; } else { part += <detail 
type="volume"><text>{item.volume}</text></detail>; @@ -2655,7 +2614,7 @@ function translate(items, collections) { // XML tag detail; object field number if(item.number) { - if(utilities.isInt(item.number)) { + if(Scholar.Utilities.isInt(item.number)) { part += <detail type="issue"><number>{item.number}</number></detail>; } else { part += <detail type="issue"><text>{item.number}</text></detail>; @@ -2664,7 +2623,7 @@ function translate(items, collections) { // XML tag detail; object field section if(item.section) { - if(utilities.isInt(item.section)) { + if(Scholar.Utilities.isInt(item.section)) { part += <detail type="section"><number>{item.section}</number></detail>; } else { part += <detail type="section"><text>{item.section}</text></detail>; @@ -2673,7 +2632,7 @@ function translate(items, collections) { // XML tag detail; object field pages if(item.pages) { - var range = utilities.getPageRange(item.pages); + var range = Scholar.Utilities.getPageRange(item.pages); part += <extent unit="pages"><start>{range[0]}</start><end>{range[1]}</end></extent>; } @@ -2804,33 +2763,35 @@ function translate(items, collections) { } modsCollection.rdf::RDF = rdfDoc;*/ - write(''<?xml version="1.0"?>''+"\n"); - write(modsCollection.toXMLString()); + Scholar.write(''<?xml version="1.0"?>''+"\n"); + Scholar.write(modsCollection.toXMLString()); }'); REPLACE INTO "translators" VALUES ('14763d24-8ba0-45df-8f52-b8d1108e7ac9', '2006-07-07 12:44:00', 2, 'Biblio/DC/FOAF/PRISM/VCard (RDF/XML)', 'Simon Kornblith', 'rdf', -'configure("getCollections", true); -configure("dataMode", "rdf");', +'Scholar.configure("getCollections", true); +Scholar.configure("dataMode", "rdf"); +Scholar.addOption("exportNotes", true); +Scholar.addOption("exportFileData", true);', 'function generateSeeAlso(resource, seeAlso) { for(var i in seeAlso) { - model.addStatement(resource, n.dc+"relation", itemResources[seeAlso[i]], false); + Scholar.RDF.addStatement(resource, n.dc+"relation", itemResources[seeAlso[i]], 
false); } } function generateCollection(collection) { var collectionResource = "#collection:"+collection.id; - model.addStatement(collectionResource, rdf+"type", n.bib+"Collection", false); + Scholar.RDF.addStatement(collectionResource, rdf+"type", n.bib+"Collection", false); for(var i in collection.children) { var child = collection.children[i]; // add child list items if(child.type == "collection") { - model.addStatement(collectionResource, n.dc+"hasPart", "#collection:"+child.id, false); + Scholar.RDF.addStatement(collectionResource, n.dc+"hasPart", "#collection:"+child.id, false); // do recursive processing of collections generateCollection(child); } else { - model.addStatement(collectionResource, n.dc+"hasPart", itemResources[child.id], false); + Scholar.RDF.addStatement(collectionResource, n.dc+"hasPart", itemResources[child.id], false); } } } @@ -2840,9 +2801,9 @@ function getContainerIfExists() { if(containerElement) { return containerElement; } else { - containerElement = model.newResource(); + containerElement = Scholar.RDF.newResource(); // attach container to section (if exists) or resource - model.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); + Scholar.RDF.addStatement((section ? 
section : resource), n.dcterms+"isPartOf", containerElement, false); return containerElement; } } else { @@ -2850,7 +2811,7 @@ function getContainerIfExists() { } } -function translate(items, collections) { +function doExport() { rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#"; n = { @@ -2864,7 +2825,7 @@ function translate(items, collections) { // add namespaces for(var i in n) { - model.addNamespace(i, n[i]); + Scholar.RDF.addNamespace(i, n[i]); } // leave as global @@ -2888,7 +2849,8 @@ function translate(items, collections) { } } - for(var i in items) { + var item; + while(item = Scholar.nextItem()) { // these items are global item = items[i]; resource = itemResources[item.itemID]; @@ -2901,7 +2863,7 @@ function translate(items, collections) { // title if(item.title) { - model.addStatement(resource, n.dc+"title", item.title, true); + Scholar.RDF.addStatement(resource, n.dc+"title", item.title, true); } // type @@ -2938,18 +2900,18 @@ function translate(items, collections) { type = "Memo"; } if(type) { - model.addStatement(resource, rdf+"type", n.bib+type, false); + Scholar.RDF.addStatement(resource, rdf+"type", n.bib+type, false); } // authors/editors/contributors var creatorContainers = new Object(); for(var j in item.creators) { - var creator = model.newResource(); - model.addStatement(creator, rdf+"type", n.foaf+"Person", false); + var creator = Scholar.RDF.newResource(); + Scholar.RDF.addStatement(creator, rdf+"type", n.foaf+"Person", false); // gee. an entire vocabulary for describing people, and these aren''t even // standardized in it. oh well. using them anyway. 
- model.addStatement(creator, n.foaf+"surname", item.creators[j].lastName, true); - model.addStatement(creator, n.foaf+"givenname", item.creators[j].firstName, true); + Scholar.RDF.addStatement(creator, n.foaf+"surname", item.creators[j].lastName, true); + Scholar.RDF.addStatement(creator, n.foaf+"givenname", item.creators[j].firstName, true); // in addition, these tags are not yet in Biblio, but Bruce D''Arcus // says they will be. @@ -2962,142 +2924,142 @@ function translate(items, collections) { } if(!creatorContainers[cTag]) { - var creatorResource = model.newResource(); + var creatorResource = Scholar.RDF.newResource(); // create new seq for author type - creatorContainers[cTag] = model.newContainer("seq", creatorResource); + creatorContainers[cTag] = Scholar.RDF.newContainer("seq", creatorResource); // attach container to resource - model.addStatement(resource, n.bib+cTag, creatorResource, false); + Scholar.RDF.addStatement(resource, n.bib+cTag, creatorResource, false); } - model.addContainerElement(creatorContainers[cTag], creator, true); + Scholar.RDF.addContainerElement(creatorContainers[cTag], creator, true); } /** FIELDS ON NEARLY EVERYTHING BUT NOT A PART OF THE CORE **/ // source if(item.source) { - model.addStatement(resource, n.dc+"source", item.source, true); + Scholar.RDF.addStatement(resource, n.dc+"source", item.source, true); } // accessionNumber as generic ID if(item.accessionNumber) { - model.addStatement(resource, n.dc+"identifier", item.accessionNumber, true); + Scholar.RDF.addStatement(resource, n.dc+"identifier", item.accessionNumber, true); } // rights if(item.rights) { - model.addStatement(resource, n.dc+"rights", item.rights, true); + Scholar.RDF.addStatement(resource, n.dc+"rights", item.rights, true); } /** SUPPLEMENTAL FIELDS **/ // use section to set up another container element if(item.section) { - section = model.newResource(); // leave as global + section = Scholar.RDF.newResource(); // leave as global // set section type - 
model.addStatement(section, rdf+"type", n.bib+"Part", false); + Scholar.RDF.addStatement(section, rdf+"type", n.bib+"Part", false); // set section title - model.addStatement(section, n.dc+"title", item.section, true); + Scholar.RDF.addStatement(section, n.dc+"title", item.section, true); // add relationship to resource - model.addStatement(resource, n.dc+"isPartOf", section, false); + Scholar.RDF.addStatement(resource, n.dc+"isPartOf", section, false); } // use ISSN to set up container element if(item.ISSN) { containerElement = "urn:issn:"+item.ISSN; // leave as global // attach container to section (if exists) or resource - model.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); + Scholar.RDF.addStatement((section ? section : resource), n.dcterms+"isPartOf", containerElement, false); } // publication gets linked to container via isPartOf if(item.publication) { - model.addStatement(getContainerIfExists(), n.dc+"title", item.publication, true); + Scholar.RDF.addStatement(getContainerIfExists(), n.dc+"title", item.publication, true); } // series also linked in if(item.series) { - var series = model.newResource(); + var series = Scholar.RDF.newResource(); // set series type - model.addStatement(series, rdf+"type", n.bib+"Series", false); + Scholar.RDF.addStatement(series, rdf+"type", n.bib+"Series", false); // set series title - model.addStatement(series, n.dc+"title", item.series, true); + Scholar.RDF.addStatement(series, n.dc+"title", item.series, true); // add relationship to resource - model.addStatement(getContainerIfExists(), n.dcterms+"isPartOf", series, false); + Scholar.RDF.addStatement(getContainerIfExists(), n.dcterms+"isPartOf", series, false); } // volume if(item.volume) { - model.addStatement(getContainerIfExists(), n.prism+"volume", item.volume, true); + Scholar.RDF.addStatement(getContainerIfExists(), n.prism+"volume", item.volume, true); } // number if(item.number) { - 
model.addStatement(getContainerIfExists(), n.prism+"number", item.number, true); + Scholar.RDF.addStatement(getContainerIfExists(), n.prism+"number", item.number, true); } // edition if(item.edition) { - model.addStatement(resource, n.prism+"edition", item.edition, true); + Scholar.RDF.addStatement(resource, n.prism+"edition", item.edition, true); } // publisher/distributor and place if(item.publisher || item.distributor || item.place) { - var organization = model.newResource(); + var organization = Scholar.RDF.newResource(); // set organization type - model.addStatement(organization, rdf+"type", n.foaf+"Organization", false); + Scholar.RDF.addStatement(organization, rdf+"type", n.foaf+"Organization", false); // add relationship to resource - model.addStatement(resource, n.dc+"publisher", organization, false); + Scholar.RDF.addStatement(resource, n.dc+"publisher", organization, false); // add publisher/distributor if(item.publisher) { - model.addStatement(organization, n.foaf+"name", item.publisher, true); + Scholar.RDF.addStatement(organization, n.foaf+"name", item.publisher, true); } else if(item.distributor) { - model.addStatement(organization, n.foaf+"name", item.distributor, true); + Scholar.RDF.addStatement(organization, n.foaf+"name", item.distributor, true); } // add place if(item.place) { - var address = model.newResource(); + var address = Scholar.RDF.newResource(); // set address type - model.addStatement(address, rdf+"type", n.vcard+"Address", false); + Scholar.RDF.addStatement(address, rdf+"type", n.vcard+"Address", false); // set address locality - model.addStatement(address, n.vcard+"locality", item.place, true); + Scholar.RDF.addStatement(address, n.vcard+"locality", item.place, true); // add relationship to organization - model.addStatement(organization, n.vcard+"adr", address, false); + Scholar.RDF.addStatement(organization, n.vcard+"adr", address, false); } } // date/year if(item.date) { - model.addStatement(resource, n.dc+"date", item.date, 
true); + Scholar.RDF.addStatement(resource, n.dc+"date", item.date, true); } else if(item.year) { - model.addStatement(resource, n.dc+"year", item.year, true); + Scholar.RDF.addStatement(resource, n.dc+"year", item.year, true); } // callNumber if(item.callNumber) { - var term = model.newResource(); + var term = Scholar.RDF.newResource(); // set term type - model.addStatement(term, rdf+"type", n.dcterms+"LCC", false); + Scholar.RDF.addStatement(term, rdf+"type", n.dcterms+"LCC", false); // set callNumber value - model.addStatement(term, rdf+"value", item.callNumber, true); + Scholar.RDF.addStatement(term, rdf+"value", item.callNumber, true); // add relationship to resource - model.addStatement(resource, n.dc+"subject", term, false); + Scholar.RDF.addStatement(resource, n.dc+"subject", term, false); } // archiveLocation if(item.archiveLocation) { - model.addStatement(resource, n.dc+"coverage", item.archiveLocation, true); + Scholar.RDF.addStatement(resource, n.dc+"coverage", item.archiveLocation, true); } // medium if(item.medium) { - model.addStatement(resource, n.dc+"medium", item.medium, true); + Scholar.RDF.addStatement(resource, n.dc+"medium", item.medium, true); } // type (not itemType) if(item.type) { - model.addStatement(resource, n.dc+"type", item.type, true); + Scholar.RDF.addStatement(resource, n.dc+"type", item.type, true); } else if(item.thesisType) { - model.addStatement(resource, n.dc+"type", item.thesisType, true); + Scholar.RDF.addStatement(resource, n.dc+"type", item.thesisType, true); } // THIS IS NOT YET IN THE BIBLIO NAMESPACE, BUT BRUCE D''ARCUS HAS SAID // IT WILL BE SOON if(item.pages) { - model.addStatement(resource, n.bib+"pages", item.pages, true); + Scholar.RDF.addStatement(resource, n.bib+"pages", item.pages, true); } /** NOTES **/ @@ -3106,25 +3068,25 @@ function translate(items, collections) { var noteResource = itemResources[item.notes[j].itemID]; // add note tag - model.addStatement(noteResource, rdf+"type", n.bib+"Memo", false); + 
Scholar.RDF.addStatement(noteResource, rdf+"type", n.bib+"Memo", false); // add note description (sorry, couldn''t find a better way of // representing this data in an existing ontology) - model.addStatement(noteResource, n.dc+"description", item.notes[j].note, true); + Scholar.RDF.addStatement(noteResource, n.dc+"description", item.notes[j].note, true); // add relationship between resource and note - model.addStatement(resource, n.dcterms+"isReferencedBy", noteResource, false); + Scholar.RDF.addStatement(resource, n.dcterms+"isReferencedBy", noteResource, false); // Add see also info to RDF generateSeeAlso(item.notes[j].itemID, item.notes[j].seeAlso); } if(item.note) { - model.addStatement(resource, n.dc+"description", item.note, true); + Scholar.RDF.addStatement(resource, n.dc+"description", item.note, true); } /** TAGS **/ for(var j in item.tags) { - model.addStatement(resource, n.dc+"subject", item.tags[j], true); + Scholar.RDF.addStatement(resource, n.dc+"subject", item.tags[j], true); } // Add see also info to RDF @@ -3134,31 +3096,24 @@ function translate(items, collections) { } /** RDF COLLECTION STRUCTURE **/ - for(var i in collections) { - generateCollection(collections[i]); + var collection; + while(collection = Scholar.nextCollection()) { + generateCollection(collection); } }'); REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006-07-05 23:40:00', 2, 'Unqualified Dublin Core (RDF/XML)', 'Simon Kornblith', 'rdf', -'configure("dataMode", "rdf");', -'function translate(items) { - var partialItemTypes = ["bookSection", "journalArticle", "magazineArticle", "newspaperArticle"]; - +'Scholar.configure("dataMode", "rdf");', +'function doExport() { var dc = "http://purl.org/dc/elements/1.1/"; - model.addNamespace("dc", dc); + Scholar.RDF.addNamespace("dc", dc); - for(var i in items) { - var item = items[i]; - + var item; + while(item = Scholar.nextItem()) { if(item.itemType == "note") { continue; } - var isPartialItem = false; - 
if(utilities.inArray(item.itemType, partialItemTypes)) { - isPartialItem = true; - } - var resource; if(item.ISBN) { resource = "urn:isbn:"+item.ISBN; @@ -3166,18 +3121,18 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 resource = item.url; } else { // just specify a node ID - resource = model.newResource(); + resource = Scholar.RDF.newResource(); } /** CORE FIELDS **/ // title if(item.title) { - model.addStatement(resource, dc+"title", item.title, true); + Scholar.RDF.addStatement(resource, dc+"title", item.title, true); } // type - model.addStatement(resource, dc+"type", item.itemType, true); + Scholar.RDF.addStatement(resource, dc+"type", item.itemType, true); // creators for(var j in item.creators) { @@ -3188,9 +3143,9 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 } if(item.creators[j].creatorType == "author") { - model.addStatement(resource, dc+"creator", creator, true); + Scholar.RDF.addStatement(resource, dc+"creator", creator, true); } else { - model.addStatement(resource, dc+"contributor", creator, true); + Scholar.RDF.addStatement(resource, dc+"contributor", creator, true); } } @@ -3198,17 +3153,17 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 // source if(item.source) { - model.addStatement(resource, dc+"source", item.source, true); + Scholar.RDF.addStatement(resource, dc+"source", item.source, true); } // accessionNumber as generic ID if(item.accessionNumber) { - model.addStatement(resource, dc+"identifier", item.accessionNumber, true); + Scholar.RDF.addStatement(resource, dc+"identifier", item.accessionNumber, true); } // rights if(item.rights) { - model.addStatement(resource, dc+"rights", item.rights, true); + Scholar.RDF.addStatement(resource, dc+"rights", item.rights, true); } /** SUPPLEMENTAL FIELDS **/ @@ -3217,84 +3172,349 @@ REPLACE INTO "translators" VALUES ('6e372642-ed9d-4934-b5d1-c11ac758ebb7', '2006 // publisher/distributor 
if(item.publisher) { - model.addStatement(resource, dc+"publisher", item.publisher, true); + Scholar.RDF.addStatement(resource, dc+"publisher", item.publisher, true); } else if(item.distributor) { - model.addStatement(resource, dc+"publisher", item.distributor, true); + Scholar.RDF.addStatement(resource, dc+"publisher", item.distributor, true); } // date/year if(item.date) { - model.addStatement(resource, dc+"date", item.date, true); + Scholar.RDF.addStatement(resource, dc+"date", item.date, true); } else if(item.year) { - model.addStatement(resource, dc+"year", item.year, true); + Scholar.RDF.addStatement(resource, dc+"year", item.year, true); } // ISBN/ISSN if(item.ISBN) { - model.addStatement(resource, dc+"identifier", "ISBN "+item.ISBN, true); + Scholar.RDF.addStatement(resource, dc+"identifier", "ISBN "+item.ISBN, true); } else if(item.ISSN) { - model.addStatement(resource, dc+"identifier", "ISSN "+item.ISSN, true); + Scholar.RDF.addStatement(resource, dc+"identifier", "ISSN "+item.ISSN, true); } // callNumber if(item.callNumber) { - model.addStatement(resource, dc+"identifier", item.callNumber, true); + Scholar.RDF.addStatement(resource, dc+"identifier", item.callNumber, true); } // archiveLocation if(item.archiveLocation) { - model.addStatement(resource, dc+"coverage", item.archiveLocation, true); + Scholar.RDF.addStatement(resource, dc+"coverage", item.archiveLocation, true); } } }'); -REPLACE INTO "translators" VALUES ('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7', '2006-06-30 15:36:00', 2, 'RIS', 'Simon Kornblith', 'ris', -'addOption("exportNotes", true); -addOption("exportFileData", true);', -'function addTag(tag, value) { - if(value) { - write(tag+" - "+value+"\r\n"); +REPLACE INTO "translators" VALUES ('5e3ad958-ac79-463d-812b-a86a9235c28f', '2006-07-15 17:09:00', 1, 'RDF', 'Simon Kornblith', 'rdf', +'Scholar.configure("dataMode", "rdf");', +'function getFirstResults(node, properties, onlyOneString) { + for(var i=0; i<properties.length; i++) { + var result = 
Scholar.RDF.getTargets(node, properties[i]); + if(result) { + if(onlyOneString) { + // onlyOneString means we won''t return nsIRDFResources, only + // actual literals + return result[0]; + } else { + return result; + } + } } + return; // return undefined on failure } -function translate(items) { - for(var i in items) { - var item = items[i]; +function doImport() { + n = { + bib:"http://purl.org/net/biblio#", + dc:"http://purl.org/dc/elements/1.1/", + dcterms:"http://purl.org/dc/terms/", + prism:"http://prismstandard.org/namespaces/1.2/basic/", + foaf:"http://xmlns.com/foaf/0.1/", + vcard:"http://nwalsh.com/rdf/vCard" + }; + + var nodes = Scholar.RDF.getAllResources(); + if(!nodes) { + return false; + } + + for(var i in nodes) { + var node = nodes[i]; + + if(Scholar.RDF.getArcsIn(node)) { + // root nodes only, please + continue; + } + + var newItem = new Scholar.Item(); - // can''t store notes in RIS + // title + newItem.title = getFirstResults(node, [n.dc+"title"], true); + if(!newItem.title) { // require the title + continue; + } + + // creators + var creators = getFirstResults(node, [n.dc+"creator"]); + Scholar.Utilities.debugPrint(creators); + if(creators) { + for(var i in creators) { + if(typeof(creators[i]) != "object") { + newItem.creators.push(Scholar.Utilities.cleanAuthor(creators[i], "author", true)); + } + } + } + + // source + newItem.source = getFirstResults(node, [n.dc+"source"], true); + + // rights + newItem.rights = getFirstResults(node, [n.dc+"rights"], true); + + // publisher + newItem.publisher = getFirstResults(node, [n.dc+"publisher"], true); + // (this will get ignored except for films, where we encode distributor as publisher) + newItem.distributor = getFirstResults(node, [n.dc+"publisher"], true); + + // date + newItem.date = getFirstResults(node, [n.dc+"date"], true); + + // year + newItem.year = getFirstResults(node, [n.dc+"year"], true); + + // identifier + var identifiers = getFirstResults(node, [n.dc+"identifier"]); + if(identifiers) { 
+ for(var i in identifiers) { + var firstFour = identifiers[i].substr(0, 4).toUpperCase(); + + if(firstFour == "ISBN") { + newItem.ISBN = identifiers[i].substr(5).toUpperCase(); + } else if(firstFour == "ISSN") { + newItem.ISSN = identifiers[i].substr(5).toUpperCase(); + } + } + } + + // identifier + newItem.coverage = getFirstResults(node, [n.dc+"coverage"]); + + newItem.complete(); + } +}'); + +REPLACE INTO "translators" VALUES ('32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7', '2006-06-30 15:36:00', 3, 'RIS', 'Simon Kornblith', 'ris', +'Scholar.configure("dataMode", "line"); +Scholar.addOption("exportNotes", true);', +'var itemsWithYears = ["book", "bookSection", "thesis", "film"]; + +var fieldMap = { + ID:"itemID", + T1:"title", + T3:"series", + JF:"publication", + VL:"volume", + IS:"number", + CP:"place", + PB:"publisher" +}; + +var inputFieldMap = { + TI:"title", + CT:"title", + JO:"publication", + CY:"place" +}; + +// TODO: figure out if these are the best types for letter, interview, website, manuscript +var typeMap = { + book:"BOOK", + bookSection:"CHAP", + journalArticle:"JOUR", + magazineArticle:"MGZN", + newspaperArticle:"NEWS", + thesis:"THES", + letter:"PCOMM", + manuscript:"UNPB", + interview:"PCOMM", + film:"MPCT", + artwork:"ART", + website:"ELEC" +}; + +// supplements outputTypeMap for importing +// TODO: BILL, CASE, COMP, CONF, DATA, HEAR, MUSIC, PAT, SOUND, STAT +var inputTypeMap = { + ABST:"journalArticle", + ADVS:"film", + CTLG:"magazineArticle", + GEN:"book", + INPR:"manuscript", + JFULL:"journalArticle", + MAP:"artwork", + PAMP:"book", + RPRT:"book", + SER:"book", + SLIDE:"artwork", + UNBILL:"manuscript", + VIDEO:"film" +}; + +function processTag(item, tag, value) { + if(fieldMap[tag]) { + item[fieldMap[tag]] = value; + } else if(tag == "TY") { + // look for type + + // first check typeMap + for(var i in typeMap) { + if(value == typeMap[i]) { + item.itemType = i; + } + } + // then check inputTypeMap + if(!item.itemType) { + if(inputTypeMap[value]) { + 
item.itemType = inputTypeMap[value]; + } else { + // default to generic from inputTypeMap + item.itemType = inputTypeMap["GEN"]; + } + } + } else if(tag == "BT") { + // ignore, unless this is a book or unpublished work, as per spec + if(item.itemType == "book" || item.itemType == "manuscript") { + item.title = value; + } + } else if(tag == "A1" || tag == "AU") { + // primary author + var names = value.split(","); + item.creators.push({lastName:names[0], firstName:names[1], creatorType:"author"}); + } else if(tag == "A2" || tag == "ED") { + // contributing author + var names = value.split(","); + item.creators.push({lastName:names[0], firstName:names[1], creatorType:"contributor"}); + } else if(tag == "Y1" || tag == "PY") { + // year or date + var dateParts = value.split("/"); + + if(dateParts.length == 1) { + // technically, if there''s only one date part, the file isn''t valid + // RIS, but EndNote accepts this, so we have to too + item.date = value+"-00-00"; + } else if(dateParts[1].length == 0 && dateParts[2].length == 0 && dateParts[3] && dateParts[3].length != 0) { + // in the case that we have a year and other data, format that way + item.date = dateParts[3]+(dateParts[0] ? " "+dateParts[0] : ""); + } else { + // standard YMD data + item.date = Scholar.Utilities.lpad(dateParts[0], "0", 4)+"-"+Scholar.Utilities.lpad(dateParts[1], "0", 2)+"-"+Scholar.Utilities.lpad(dateParts[2], "0", 2); + } + } else if(tag == "N1" || tag == "AB") { + // notes + item.notes.push({note:value}); + } else if(tag == "KW") { + // keywords/tags + item.tags.push(value); + } else if(tag == "SP") { + // start page + if(!item.pages) { + item.pages = value; + } else if(item.pages[0] == "-") { // already have ending page + item.pages = value + item.pages; + } else { // multiple ranges? 
hey, it''s a possibility + item.pages += ", "+value; + } + } else if(tag == "EP") { + // end page + if(value) { + if(!item.pages || value != item.pages) { + if(!item.pages) { + item.pages = ""; + } + item.pages += "-"+value; + } + } + } else if(tag == "SN") { + // ISSN/ISBN - just add both + if(!item.ISBN) { + item.ISBN = value; + } + if(!item.ISSN) { + item.ISSN = value; + } + } else if(tag == "UR") { + // URL + item.url = value; + } +} + +function doImport() { + var line = true; + var tag = data = false; + do { // first valid line is type + line = Scholar.read(); + Scholar.Utilities.debugPrint(line); + } while(line !== false && line.substr(0, 6) != "TY - "); + + var item = new Scholar.Item(); + var tag = "TY"; + var data = line.substr(6); + + while((line = Scholar.read()) !== false) { // until EOF + if(line.substr(2, 4) == " - ") { + // if this line is a tag, take a look at the previous line to map + // its tag + if(tag) { + processTag(item, tag, data); + } + + // then fetch the tag and data from this line + tag = line.substr(0,2); + data = line.substr(6); + + Scholar.Utilities.debugPrint("tag: ''"+tag+"''; data: ''"+data+"''"); + + if(tag == "ER") { // ER signals end of reference + // unset info + tag = data = false; + // new item + item.complete(); + item = new Scholar.Item(); + } + } else { + // otherwise, assume this is data from the previous line continued + if(tag) { + data += line; + } + } + } + + if(tag) { // save any unprocessed tags + processTag(item, tag, data); + item.complete(); + } +} + +function addTag(tag, value) { + if(value) { + Scholar.write(tag+" - "+value+"\r\n"); + } +} + +function doExport() { + var item; + + while(item = Scholar.nextItem()) { + // can''t store independent notes in RIS if(item.itemType == "note") { continue; } // type - // TODO - figure out if these are the best types for letter, interview, website - if(item.itemType == "book") { - var risType = "BOOK"; - } else if(item.itemType == "bookSection") { - var risType = "CHAP"; - 
} else if(item.itemType == "journalArticle") { - var risType = "JOUR"; - } else if(item.itemType == "magazineArticle") { - var risType = "MGZN"; - } else if(item.itemType == "newspaperArticle") { - var risType = "NEWS"; - } else if(item.itemType == "thesis") { - var risType = "THES"; - } else if(item.itemType == "letter" || item.itemType == "interview") { - var risType = "PCOMM"; - } else if(item.itemType == "film") { - var risType = "MPCT"; - } else if(item.itemType == "artwork") { - var risType = "ART"; - } else if(item.itemType == "website") { - var risType = "ICOMM"; - } - addTag("TY", risType); - // ID - addTag("ID", item.itemID); - // primary title - addTag("T1", item.title); - // series title - addTag("T3", item.series); + addTag("TY", typeMap[item.itemType]); + + // use field map + for(var j in fieldMap) { + addTag(j, item[fieldMap[j]]); + } + // creators for(var j in item.creators) { // only two types, primary and secondary @@ -3305,6 +3525,7 @@ function translate(items) { addTag(risTag, item.creators[j].lastName+","+item.creators[j].firstName); } + // date if(item.date) { var isoDate = /^[0-9]{4}-[0-9]{2}-[0-9]{2}$/; @@ -3320,35 +3541,536 @@ function translate(items) { } else if(item.year) { addTag("Y1", item.year+"///"); } + // notes for(var j in item.notes) { addTag("N1", item.notes[j].note); } - // publication - addTag("JF", item.publication); - // volume - addTag("VL", item.volume); - // number - addTag("IS", item.number); + + // tags + for(var j in item.tags) { + addTag("KY", item.tags[j]); + } + // pages if(item.pages) { - var range = utilities.getPageRange(item.pages); + var range = Scholar.Utilities.getPageRange(item.pages); addTag("SP", range[0]); addTag("EP", range[1]); } - // place - addTag("CP", item.place); - // publisher - addTag("PB", item.publisher); + // ISBN/ISSN addTag("SN", item.ISBN); addTag("SN", item.ISSN); + // URL if(item.url) { addTag("UR", item.url); } else if(item.source && item.source.substr(0, 7) == "http://") { addTag("UR", 
item.source); } - write("\r\n"); + + Scholar.write("ER - \r\n\r\n"); + } +}'); + +REPLACE INTO "translators" VALUES ('a6ee60df-1ddc-4aae-bb25-45e0537be973', '2006-07-16 17:18:00', 1, 'MARC', 'Simon Kornblith', 'marc', +NULL, +'/* +* Original version of MARC record library copyright (C) 2005 Stefano Bargioni, +* licensed under the LGPL +* +* (Available at http://www.pusc.it/bib/mel/Scholar.Ingester.MARC_Record.js) +* +* This library is free software; you can redistribute it or +* modify it under the terms of the GNU General Public +* License as published by the Free Software Foundation; either +* version 2 of the License, or (at your option) any later version. +* +* This library is distributed in the hope that it will be useful, +* but WITHOUT ANY WARRANTY; without even the implied warranty of +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +* General Public License for more details. +*/ + +var MARC_Record = function() { // new MARC record + this.leader = { + record_length:''00000'', + record_status:''n'', // acdnp + type_of_record:'' '', + bibliographic_level:'' '', + type_of_control:'' '', + character_coding_scheme:'' '', + indicator_count:''2'', + subfield_code_length:''2'', + base_address_of_data:''00000'', + encoding_level:'' '', + descriptive_cataloging_form:'' '', + linked_record_requirement:'' '', + entry_map:''4500'' + }; // 24 chars + + this.field_terminator = ''\x1E''; + this.record_terminator = ''\x1D''; + this.subfield_delimiter = ''\x1F''; + this.directory = ''''; + this.directory_terminator = this.field_terminator; + this.variable_fields = new Array(); +}; + +MARC_Record.prototype.load = function(s,f) { // loads record s passed in format f + if (f == ''binary'') { + this.leader.record_length = ''00000''; + this.leader.record_status = s.substr(5,1); + this.leader.type_of_record = s.substr(6,1); + this.leader.bibliographic_level = s.substr(7,1); + this.leader.type_of_control = s.substr(8,1); + this.leader.character_coding_scheme = 
s.substr(9,1); + this.leader.indicator_count = ''2''; + this.leader.subfield_code_length = ''2''; + this.leader.base_address_of_data = ''00000''; + this.leader.encoding_level = s.substr(17,1); + this.leader.descriptive_cataloging_form = s.substr(18,1); + this.leader.linked_record_requirement = s.substr(19,1); + this.leader.entry_map = ''4500''; + + this.directory = ''''; + this.directory_terminator = this.field_terminator; + this.variable_fields = new Array(); + + // loads fields + var campi = s.split(this.field_terminator); + var k; + for (k=1; k<-1+campi.length; k++) { // the first and the last are unuseful + // the first is the header + directory, the last is the this.record_terminator + var tag = campi[0].substr(24+(k-1)*12,3); + var ind1 = ''''; var ind2 = ''''; var value = campi[k]; + if (tag.substr(0,2) != ''00'') { + ind1 = campi[k].substr(0,1); + ind2 = campi[k].substr(1,1); + value = campi[k].substr(2); + } + this.add_field(tag,ind1,ind2,value); + } + } + + this.update_record_length(); + this.update_base_address_of_data(); + return this; +} + +MARC_Record.prototype.update_base_address_of_data = function() { // updates the base_address + this.leader.base_address_of_data = this._zero_fill(24+this.variable_fields.length*12+1,5); + return this.leader.base_address_of_data; +} + +MARC_Record.prototype.update_displacements = function() { // rebuilds the directory + var displ = 0; + this.directory = ''''; + for (var i=0; i<this.variable_fields.length; i++) { + var len = this.variable_fields[i].value.length + 1 + + this.variable_fields[i].ind1.length + + this.variable_fields[i].ind2.length; + this.directory += this.variable_fields[i].tag + + this._zero_fill(len,4) + this._zero_fill(displ,5); + displ += len; + } + return true; +} +MARC_Record.prototype.update_record_length = function() { // updates total record length + var fields_total_length = 0; var f; + for (f=0; f<this.variable_fields.length;f++) { + fields_total_length += 
this.variable_fields[f].ind1.length+this.variable_fields[f].ind2.length+this.variable_fields[f].value.length + 1; + } + var rl = 24+this.directory.length+1+fields_total_length+1; + this.leader.record_length = this._zero_fill(rl,5); +} + +MARC_Record.prototype.sort_directory = function() { // sorts directory and array variable_fields by tag and occ + // ordinamento della directory + if (this.directory.length <= 12) { return true; } // already sorted + var directory_entries = new Array(); + var i; + for (i=0; i<this.directory.length; i=i+12) { + directory_entries[directory_entries.length] = this.directory.substr(i,12); + } + directory_entries.sort(); + this.directory = directory_entries.join(''''); + // sorts array variable_fields + this.variable_fields.sort(function(a,b) { return a.tag - b.tag + a.occ - b.occ; }); + return true; +} + +MARC_Record.prototype.show_leader = function() { + var leader = ''''; var f; + for (f in this.leader) { leader += this.leader[f]; } + return leader; +} + +MARC_Record.prototype.show_fields = function() { + var fields = ''''; var f; + for (f=0; f<this.variable_fields.length;f++) { + fields += this.variable_fields[f].ind1 + + this.variable_fields[f].ind2 + + this.variable_fields[f].value + + this.field_terminator; + } + return fields; +} + +MARC_Record.prototype.show_directory = function() { + var d = ''''; + for (var i = 0; i<this.directory.length; i+=12) { + d += this.directory.substr(i,3) + '' '' + + this.directory.substr(i+3,4) + '' '' + + this.directory.substr(i+7,5) + ''\n''; + } + return d; +} + +MARC_Record.prototype.add_field_005 = function() { + var now = new Date(); + now = now.getFullYear() + + this._zero_fill(now.getMonth()+1,2) + + this._zero_fill(now.getDate(),2) + + this._zero_fill(now.getHours(),2) + + this._zero_fill(now.getMinutes(),2) + + this._zero_fill(now.getSeconds(),2) + ''.0''; + this.add_field(''005'','''','''',now); + return now; +} + +MARC_Record.prototype.count_occ = function(tag) { // counts occ of tag + 
var n = 0; + for (var i=0; i<this.variable_fields.length; i++) { + if (this.variable_fields[i].tag == tag) { n++; } + } + return n; +} + +MARC_Record.prototype.exists = function(tag) { // field existence + if (this.count_occ(tag) > 0) return true; + return false; +} + +MARC_Record.prototype.MARC_field = function(rec,tag,ind1,ind2,value) { // new MARC field + this.tag = tag; + this.occ = rec.count_occ(tag)+1; // occurrence order no. + this.ind1 = ind1; if (this.ind1 == '''') this.ind1 = '' ''; + this.ind2 = ind2; if (this.ind2 == '''') this.ind2 = '' ''; + if (tag.substr(0,2) == ''00'') { + this.ind1 = ''''; this.ind2 = ''''; + } + this.value = value; + return this; +} + +MARC_Record.prototype.display = function(type) { // displays record in format type + type = type.toLowerCase(); + if (type == ''binary'') return this.show_leader() + + this.directory + + this.field_terminator + + this.show_fields() + + this.record_terminator; + if (type == ''xml'') { + s = ''''; + s += ''<?xml version="1.0" encoding="iso-8859-1"?><collection xmlns="http://www.loc.gov/MARC21/slim"><record>''; + s += ''<leader>''+this.show_leader()+''</leader>''; + // var i; + for (i=0; i<this.variable_fields.length; i++) { + ind1 = this.variable_fields[i].ind1; if (ind1 != '''') ind1 = '' ind1="''+ind1+''"''; + ind2 = this.variable_fields[i].ind2; if (ind2 != '''') ind2 = '' ind2="''+ind2+''"''; + if (this.variable_fields[i].tag.substr(0,2) == ''00'') s += ''<controlfield tag="''+this.variable_fields[i].tag+''">''+this.variable_fields[i].value+''</controlfield>''; + else { + var subfields = this.variable_fields[i].value.split(this.subfield_delimiter); + // alert(this.variable_fields[i].value+'' ''+subfields.length); // test + if (subfields.length == 1) subfields[1] = ''?''+this.variable_fields[i].value; + var sf = ''''; + for (var j=1; j<subfields.length; j++) { + sf += ''<subfield code="''+subfields[j].substr(0,1)+''">''+subfields[j].substr(1)+''</subfield>''; + } + s += ''<datafield tag="'' + 
this.variable_fields[i].tag + ''"'' + ind1 + ind2 + ''>'' + sf + ''</datafield>''; + } + } + s += ''</record></collection>''; + return s; + } + return false; +} + +MARC_Record.prototype.get_field = function(tag) { // returns an array of values, one for each occurrence + var v = new Array(); var i; + for (i=0; i<this.variable_fields.length; i++) { + if (this.variable_fields[i].tag == tag) { + v[v.length] = this.variable_fields[i].ind1 + + this.variable_fields[i].ind2 + + this.variable_fields[i].value; + } + } + return v; +} + +// This function added by Simon Kornblith +MARC_Record.prototype.get_field_subfields = function(tag) { // returns a two-dimensional array of values + var field = this.get_field(tag); + var return_me = new Array(); + for(var i in field) { + return_me[i] = new Object(); + var subfields = field[i].split(this.subfield_delimiter); + if (subfields.length == 1) { + return_me[i][''?''] = field[i]; + } else { + for (var j=1; j<subfields.length; j++) { + return_me[i][subfields[j].substr(0,1)] = subfields[j].substr(1); + } + } + } + return return_me; +} + +MARC_Record.prototype.add_field = function(tag,ind1,ind2,value) { // adds a field to the record + if (tag.length != 3) { return false; } + var F = new this.MARC_field(this,tag,ind1,ind2,value); + // adds pointer to list of fields + this.variable_fields[this.variable_fields.length] = F; + // adds the entry to the directory + this.directory += F.tag+this._zero_fill(F.ind1.length+F.ind2.length+F.value.length+1,4)+''00000''; + // sorts the directory + this.sort_directory(); + // updates lengths + this.update_base_address_of_data(); + this.update_displacements(); + this.update_record_length(); + return F; +} + +MARC_Record.prototype.delete_field = function(tag,occurrence) { + // lookup and delete the occurrence from array variable_fields + var i; + for (i=0; i<this.variable_fields.length; i++) { + if (this.variable_fields[i].tag == tag && this.variable_fields[i].occ == occurrence) break; + } + if 
(i==this.variable_fields.length) return false; // campo non trovato + // deletes the occ. i from array variable_fields scaling next values + var j; + for (j=i+1; j<this.variable_fields.length; j++) { + this.variable_fields[i++]=this.variable_fields[j]; + } + this.variable_fields.length--; // deletes last element + // lookup and delete the occurrence from directory (must exist; no sort is needed) + var nocc = 0; + // var i; + for (i=0; i<this.directory.length;i=i+12) { + if (this.directory.substr(i,3) == tag) nocc++; + if (occurrence == nocc) { // occ found + break; + } + } + if (i >= this.directory.length) alert(''Internal error!''); + this.directory = this.directory.substr(0,i) + this.directory.substr(i+12); + // updates lengths + this.update_base_address_of_data(); + this.update_displacements(); + this.update_record_length(); + return true; +} + +MARC_Record.prototype._clean = function(value) { + value = value.replace(/^[\s\.\,\/\:]+/, ''''); + value = value.replace(/[\s\.\,\/\:]+$/, ''''); + value = value.replace(/ +/g, '' ''); + + var char1 = value[1]; + var char2 = value[value.length-1]; + if((char1 == "[" && char2 == "]") || (char1 == "(" && char2 == ")")) { + // chop of extraneous characters + return value.substr(1, value.length-2); + } + + return value; +} + +MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldName, execMe, arg1, arg2) { + if(!part) { + part = ''a''; + } + var field = this.get_field_subfields(fieldNo); + Scholar.Utilities.debugPrint(''Found ''+field.length+'' matches for ''+fieldNo+part); + if(field) { + for(var i in field) { + var value = false; + for(var j=0; j<part.length; j++) { + var myPart = part[j]; + if(field[i][myPart]) { + if(value) { + value += " "+field[i][myPart]; + } else { + value = field[i][myPart]; + } + } + } + if(value) { + value = this._clean(value); + + if(execMe) { + value = execMe(value, arg1, arg2); + } + + // TODO: handle creators better + if(fieldName == "creator") { + 
item.creators.push(value); + } else { + item[fieldName] = value; + } + } + } + } +} + +MARC_Record.prototype._associateTags = function(item, fieldNo, part) { + var field = this.get_field_subfields(fieldNo); + + for(var i in field) { + for(var j=0; j<part.length; j++) { + var myPart = part[j]; + if(field[i][myPart]) { + item.tags.push(this._clean(field[i][myPart])); + } + } + } +} + +// this function loads a MARC record into our database +MARC_Record.prototype.translate = function(item) { + // cleaning functions - use a closure to improve readability because they''ll + // only be called once per record anyway + function _pullNumber(text) { + var pullRe = /[0-9]+/; + var m = pullRe.exec(text); + if(m) { + return m[0]; + } + } + + function _corpAuthor(author) { + return {lastName:author}; + } + + // not sure why this is necessary, but without it, this code is inaccessible + // from other translators + function _author(author, type, useComma) { + return Scholar.Utilities.cleanAuthor(author, type, useComma); + } + + // Extract ISBNs + this._associateDBField(item, ''020'', ''a'', ''ISBN'', _pullNumber); + // Extract ISSNs + this._associateDBField(item, ''022'', ''a'', ''ISSN'', _pullNumber); + // Extract creators + this._associateDBField(item, ''100'', ''a'', ''creator'', _author, ''author'', true); + this._associateDBField(item, ''110'', ''a'', ''creator'', _corpAuthor, ''author''); + this._associateDBField(item, ''111'', ''a'', ''creator'', _corpAuthor, ''author''); + this._associateDBField(item, ''700'', ''a'', ''creator'', _author, ''contributor'', true); + this._associateDBField(item, ''710'', ''a'', ''creator'', _corpAuthor, ''contributor''); + this._associateDBField(item, ''711'', ''a'', ''creator'', _corpAuthor, ''contributor''); + if(!item.creators.length) { + // some LOC entries have no listed author, but have the author in the person subject field as the first entry + var field = this.get_field_subfields(''600''); + if(field[0]) { + 
item.creators.push(this.cleanAuthor(field[0][''a''], true)); + } + } + + // Extract tags + // personal + this._associateTags(item, "600", "aqtxyz"); + // corporate + this._associateTags(item, "611", "abtxyz"); + // meeting + this._associateTags(item, "630", "acetxyz"); + // uniform title + this._associateTags(item, "648", "atxyz"); + // chronological + this._associateTags(item, "650", "axyz"); + // topical + this._associateTags(item, "651", "abcxyz"); + // geographic + this._associateTags(item, "653", "axyz"); + // uncontrolled + this._associateTags(item, "653", "a"); + // faceted topical term (whatever that means) + this._associateTags(item, "654", "abcyz"); + // genre/form + this._associateTags(item, "655", "abcxyz"); + // occupation + this._associateTags(item, "656", "axyz"); + // function + this._associateTags(item, "657", "axyz"); + // curriculum objective + this._associateTags(item, "658", "ab"); + // hierarchical geographic place name + this._associateTags(item, "662", "abcdfgh"); + + // Extract title + this._associateDBField(item, ''245'', ''ab'', ''title''); + // Extract edition + this._associateDBField(item, ''250'', ''a'', ''edition''); + // Extract place info + this._associateDBField(item, ''260'', ''a'', ''place''); + // Extract publisher info + this._associateDBField(item, ''260'', ''b'', ''publisher''); + // Extract year + this._associateDBField(item, ''260'', ''c'', ''year'', _pullNumber); + // Extract series + this._associateDBField(item, ''440'', ''a'', ''series''); + // Extract call number + this._associateDBField(item, ''050'', ''ab'', ''callNumber''); + this._associateDBField(item, ''060'', ''ab'', ''callNumber''); + this._associateDBField(item, ''070'', ''ab'', ''callNumber''); + this._associateDBField(item, ''080'', ''ab'', ''callNumber''); + this._associateDBField(item, ''082'', ''a'', ''callNumber''); + this._associateDBField(item, ''084'', ''ab'', ''callNumber''); + + // Set type + item.itemType = "book"; +} + +MARC_Record.prototype._trim 
= function(s) { // eliminates blanks from both sides + s = s.replace(/\s+$/,''''); + return s.replace(/^\s+/,''''); +} + +MARC_Record.prototype._zero_fill = function(s,l) { // left ''0'' padding of s, up to l (l<=15) + var t = ''000000000000000''; + t = t+s; + return t.substr(t.length-l,l); +} + +function doImport(url) { // the URL is actually here for other translators + var text; + var holdOver = ""; // part of the text held over from the last loop + + while(text = Scholar.read(4096)) { // read in 4096 byte increments + var records = text.split("\x1D"); + Scholar.Utilities.debugPrint(records); + + if(records.length > 1) { + records[0] = holdOver + records[0]; + holdOver = records.pop(); // skip last record, since it''s not done + + for(var i in records) { + var newItem = new Scholar.Item(); + newItem.source = url; + + // create new record + var record = new MARC_Record(); + record.load(records[i], "binary"); + record.translate(newItem); + + newItem.complete(); + } + } else { + holdOver += text; + } } }'); \ No newline at end of file