www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 064ecd17db3c8228d3db2e5a2b0d34b14700cd3c
parent 1e8aa81c020db404966c19c06e909326edc83aea
Author: Simon Kornblith <simon@simonster.com>
Date:   Fri, 11 Aug 2006 15:28:18 +0000

removes unnecessary pieces of piggy bank API from utilities and updates translators to abide by current translator guidelines


Diffstat:
Mchrome/chromeFiles/content/scholar/xpcom/utilities.js | 105+++++++++----------------------------------------------------------------------
Mscrapers.sql | 225++++++++++++++++++++++++++++++++++++++-----------------------------------------
2 files changed, 120 insertions(+), 210 deletions(-)

diff --git a/chrome/chromeFiles/content/scholar/xpcom/utilities.js b/chrome/chromeFiles/content/scholar/xpcom/utilities.js @@ -1,61 +1,21 @@ // Scholar for Firefox Utilities -// Utilities based on code taken from Piggy Bank 2.1.1 (BSD-licensed) -// This code is licensed according to the GPL ///////////////////////////////////////////////////////////////// // // Scholar.Utilities // ///////////////////////////////////////////////////////////////// -// Scholar.Utilities class, a set of methods to assist in data -// extraction. Some of the code here was stolen directly from the Piggy Bank -// project. Scholar.Utilities = function () {} -// Adapter for Piggy Bank function to print debug messages; log level is -// fixed at 4 (could change this) -Scholar.Utilities.prototype.debugPrint = function(msg) { +Scholar.Utilities.prototype.debug = function(msg) { Scholar.debug(msg, 4); } -// Appears to trim a string, chopping of newlines/spacing -Scholar.Utilities.prototype.trimString = function(s) { - var i = 0; - var spaceChars = " \n\r\t" + String.fromCharCode(160) /* &nbsp; */; - while (i < s.length) { - var c = s.charAt(i); - if (spaceChars.indexOf(c) < 0) { - break; - } - i++; - } - - s = s.substring(i); - - i = s.length; - while (i > 0) { - var c = s.charAt(i - 1); - if (spaceChars.indexOf(c) < 0) { - break; - } - i--; - } - - return s.substring(0, i); -} - -/* - * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS - * Functions below this point are extensions to the utilities provided by - * Piggy Bank. When used in external code, the repository will need to add - * a function definition when exporting in Piggy Bank format. - */ - /* - * Converts a JavaScript date object to an ISO-style date + * Converts a JavaScript date object to an SQL-style date */ -Scholar.Utilities.prototype.dateToISO = function(jsDate) { +Scholar.Utilities.prototype.dateToSQL = function(jsDate) { var date = ""; var year = jsDate.getFullYear().toString(); var month = (jsDate.getMonth()+1).toString(); @@ -112,7 +72,8 @@ Scholar.Utilities.prototype.cleanAuthor = function(author, type, useComma) { */ Scholar.Utilities.prototype.cleanString = function(s) { s = s.replace(/[ \xA0]+/g, " "); - return this.trimString(s); + s = s.replace(/^\s+/, ""); + return s.replace(/\s+$/, ""); } /* @@ -223,43 +184,6 @@ Scholar.Utilities.Ingester.prototype.gatherElementsOnXPath = function(doc, paren return elmts; } -// Appears to look for links in a document containing a certain substring (kind -// of like getItemArray, only with NO REGEXP FUNCTIONALITY) -Scholar.Utilities.Ingester.prototype.collectURLsWithSubstring = function(doc, substring) { - var urls = []; - var addedURLs = []; - - var aElements = doc.evaluate("//a", doc, null, Components.interfaces.nsIDOMXPathResult.ANY_TYPE,null); - var aElement = aElements.iterateNext(); - while (aElement) { - var href = aElement.href; - if (href.indexOf(substring) >= 0 && !(addedURLs[href])) { - urls.unshift(href); - addedURLs[href] = true; - } - aElement = aElements.iterateNext(); - } - return urls; -} - -// For now, we're going to skip the getLLsFromAddresses function (which gets -// latitude and longitude pairs from a series of addresses, but requires the -// big mess of Java code that is the Piggy Bank server) and the geoHelper -// tools (which rely on getLLsFromAddresses) since these are probably not -// essential components for Scholar and would take a great deal of effort to -// implement. We can, however, always implement them later. - -/* - * BEGIN SCHOLAR FOR FIREFOX EXTENSIONS - */ - -/* - * Gets a given node (assumes only one value) - */ -Scholar.Utilities.Ingester.prototype.getNode = function(doc, contextNode, xpath, nsResolver) { - return doc.evaluate(xpath, contextNode, nsResolver, Components.interfaces.nsIDOMXPathResult.ANY_TYPE, null).iterateNext(); -} - /* * Gets a given node as a string containing all child nodes */ @@ -325,10 +249,6 @@ Scholar.Utilities.Ingester.prototype.parseContextObject = function(co, item) { return Scholar.OpenURL.parseContextObject(co, item); } -/* - * END SCHOLAR FOR FIREFOX EXTENSIONS - */ - // Ingester adapters for Scholar.Utilities.HTTP to handle proxies Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, failed) { @@ -337,11 +257,13 @@ Scholar.Utilities.Ingester.prototype.loadDocument = function(url, succeeded, fai } Scholar.Utilities.HTTP.processDocuments(null, [ url ], succeeded, function() {}, failed); } -Scholar.Utilities.Ingester.prototype.processDocuments = function(firstDoc, urls, processor, done, exception) { - for(i in urls) { - urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]); +Scholar.Utilities.Ingester.prototype.processDocuments = function(urls, processor, done, exception) { + if(this.proxiedURL) { + for(i in urls) { + urls[i] = Scholar.Ingester.ProxyMonitor.properToProxy(urls[i]); + } } - Scholar.Utilities.HTTP.processDocuments(firstDoc, urls, processor, done, exception); + Scholar.Utilities.HTTP.processDocuments(null, urls, processor, done, exception); } Scholar.Utilities.Ingester.HTTPUtilities = function(proxiedURL) { @@ -615,10 +537,7 @@ Scholar.Utilities.HTTP.processDocuments = function(firstDoc, urls, processor, do if(hiddenBrowser.contentDocument.location.href != prevUrl) { // Just in case it fires too many times prevUrl = hiddenBrowser.contentDocument.location.href; try { - var newHiddenBrowser = new Object(); - newHiddenBrowser.contentDocument = hiddenBrowser.contentDocument; - newHiddenBrowser.contentWindow = hiddenBrowser.contentWindow; - processor(newHiddenBrowser); + processor(hiddenBrowser.contentDocument); } catch (e) { Scholar.debug("Scholar.Utilities.Ingester.processDocuments onLoad: " + e, 2); exception(e); diff --git a/scrapers.sql b/scrapers.sql @@ -1,9 +1,9 @@ --- 41 +-- 42 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-08 17:12:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-08-11 11:18:00')); -REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-28 23:08:00', 4, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', +REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-08-11 11:18:00', 4, 'Amazon.com', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/|s/)', 'function detectWeb(doc, url) { var searchRe = new RegExp(''^http://www\.amazon\.com/(gp/search/|exec/obidos/search-handle-url/|s/)''); if(searchRe.test(doc.location.href)) { @@ -28,7 +28,7 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006 var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; - var author = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue; + var author = doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; newItem.creators.push(Scholar.Utilities.cleanAuthor(author, "author")); } @@ -40,15 +40,15 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006 for (var i = 0; i < elmts.length; i++) { try { var elmt = elmts[i]; - var attribute = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./B[1]/text()[1]'', nsResolver).nodeValue); - if(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver)) { - var value = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver).nodeValue); + var attribute = Scholar.Utilities.cleanString(doc.evaluate(''./B[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); + if(doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) { + var value = Scholar.Utilities.cleanString(doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); if(attribute == "Publisher:") { if(value.lastIndexOf("(") != -1) { var date = value.substring(value.lastIndexOf("(")+1, value.length-1); jsDate = new Date(date); if(!isNaN(jsDate.valueOf())) { - date = Scholar.Utilities.dateToISO(jsDate); + date = Scholar.Utilities.dateToSQL(jsDate); } newItem.date = date; @@ -74,7 +74,7 @@ REPLACE INTO "translators" VALUES ('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006 var xpath = ''/html/body/table/tbody/tr/td[2]/form/div[@class="buying"]/b[@class="sans"]''; var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); - var title = Scholar.Utilities.cleanString(Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue); + var title = Scholar.Utilities.cleanString(doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); if(title.lastIndexOf("(") != -1 && title.lastIndexOf(")") == title.length-1) { title = title.substring(0, title.lastIndexOf("(")-1); } @@ -113,7 +113,7 @@ function doWeb(doc, url) { uris.push(i); } - Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, function() { Scholar.done(); }, function() {}); Scholar.wait(); @@ -122,7 +122,7 @@ function doWeb(doc, url) { } }'); -REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat Scraper', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', +REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006-06-26 16:01:00', 4, 'WorldCat', 'Simon Kornblith', '^http://(?:new)?firstsearch\.oclc\.org/WebZ/', 'function detectWeb(doc, url) { if(doc.title == ''FirstSearch: WorldCat Detailed Record'') { return "book"; @@ -195,7 +195,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 } Scholar.Utilities.HTTPUtilities.doPost(newUri, ''exportselect=''+exportselect+''&exporttype=plaintext'', null, function(text) { - Scholar.Utilities.debugPrint(text); + Scholar.Utilities.debug(text); var lineRegexp = new RegExp(); lineRegexp.compile("^([\\w() ]+): *(.*)$"); @@ -240,17 +240,17 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 newItem.creators.push(Scholar.Utilities.cleanAuthor(authors[j], "author", true)); } } else { - newItem.creators.push(Scholar.Utilities.trimString(match[2])); + newItem.creators.push(Scholar.Utilities.cleanString(match[2])); } } else if(match[1] == ''Publication'') { // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it. - match[2] = Scholar.Utilities.trimString(match[2]); + match[2] = Scholar.Utilities.cleanString(match[2]); if(match[2].substring(match[2].length-1) == '','') { match[2] = match[2].substring(0, match[2].length-1); } newItem.publisher = match[2]; /*} else if(match[1] == ''Language'') { - .addStatement(uri, prefixDC + ''language'', Scholar.Utilities.trimString(match[2]));*/ + .addStatement(uri, prefixDC + ''language'', Scholar.Utilities.cleanString(match[2]));*/ } else if(match[1] == ''Standard No'') { var identifiers = match[2].split(/ +/); var j=0; @@ -287,7 +287,7 @@ REPLACE INTO "translators" VALUES ('838d8849-4ffb-9f44-3d0d-aa8a0a079afe', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage Scraper', 'Simon Kornblith', 'Pwebrecon\.cgi', +REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006-06-26 21:40:00', 4, 'LOC/Voyager WebVoyage', 'Simon Kornblith', 'Pwebrecon\.cgi', 'function detectWeb(doc, url) { var export_options = doc.forms.namedItem(''frm'').elements.namedItem(''RD'').options; for(var i in export_options) { @@ -335,7 +335,7 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006 // Go through table rows for(var i=0; i<tableRows.length; i++) { // CHK is what we need to get it all as one file - var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./td/input[@name="CHK"]'', nsResolver); + var input = doc.evaluate(''./td/input[@name="CHK"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); checkboxes[i] = input.value; var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); urls[i] = links[0].href; @@ -414,7 +414,7 @@ REPLACE INTO "translators" VALUES ('88915634-1af6-c134-0171-56fd198235ed', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 4, 'JSTOR Scraper', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', +REPLACE INTO "translators" VALUES ('d921155f-0186-1684-615c-ca57682ced9b', '2006-06-26 16:01:00', 4, 'JSTOR', 'Simon Kornblith', '^http://www\.jstor\.org/(?:view|browse|search/)', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -484,7 +484,7 @@ function doWeb(doc, url) { // Go through links for(var j=0; j<links.length; j++) { if(tagRegexp.test(links[j].href)) { - var text = Scholar.Utilities.getNode(doc, tableRows[i], ''.//strong/text()'', null); + var text = doc.evaluate(''.//strong/text()'', tableRows[i], null, XPathResult.ANY_TYPE, null).iterateNext(); if(text && text.nodeValue) { text = Scholar.Utilities.cleanString(text.nodeValue); if(availableItems[links[j].href]) { @@ -562,7 +562,7 @@ function doWeb(doc, url) { if(isNaN(date.valueOf())) { newItem.date = fieldContent; } else { - newItem.date = Scholar.Utilities.dateToISO(date); + newItem.date = Scholar.Utilities.dateToSQL(date); } } else if(fieldCode == "PP") { newItem.pages = fieldContent; @@ -589,7 +589,7 @@ function doWeb(doc, url) { Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', +REPLACE INTO "translators" VALUES ('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-26 16:01:00', 4, 'History Cooperative', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', 'function detectWeb(doc, url) { if(doc.title == "History Cooperative: Search Results") { return "multiple"; @@ -647,7 +647,7 @@ function doWeb(doc, url) { uris.push(i); } - Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, function() { Scholar.done(); }, function() {}); Scholar.wait(); @@ -656,7 +656,7 @@ function doWeb(doc, url) { } }'); -REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-08-06 21:45:00', 4, 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', +REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-08-06 21:45:00', 4, 'InnoPAC', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', 'function detectWeb(doc, url) { // First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button var matchRegexp = new RegExp(''^(http://[^/]+/search/[^/]+/[^/]+/1\%2C[^/]+/)frameset(.+)$''); @@ -709,9 +709,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); if(newUri) { // single page - Scholar.Utilities.loadDocument(newUri, function(newBrowser) { - newDoc = newBrowser.contentDocument; - + Scholar.Utilities.loadDocument(newUri, function(newDoc) { var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -720,7 +718,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 var xpath = ''//pre''; var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); - var text = Scholar.Utilities.getNode(doc, elmts[0], ''./text()[1]'', nsResolver).nodeValue; + var text = doc.evaluate(''./text()[1]'', elmts[0], nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; var newItem = new Scholar.Item(); newItem.source = uri; @@ -781,7 +779,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 // Go through table rows for(var i=0; i<tableRows.length; i++) { // CHK is what we need to get it all as one file - var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./td/input[@type="checkbox"]'', nsResolver); + var input = doc.evaluate(''./td/input[@type="checkbox"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); checkboxes[i] = input.name+"="+escape(input.value); var links = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); urls[i] = links[0].href; @@ -836,7 +834,7 @@ REPLACE INTO "translators" VALUES ('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 4, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-26 16:01:00', 4, 'SIRSI 2003+', 'Simon Kornblith', '/uhtbin/cgisirsi', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -872,12 +870,12 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006 for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; try { - var node = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); + var node = doc.evaluate(''./TD[1]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(!node) { - var node = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); + var node = doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); } if(node) { - var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); + var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TH[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); field = field.toLowerCase(); var value = Scholar.Utilities.superCleanString(node.nodeValue); if(field == "publisher") { @@ -907,7 +905,7 @@ REPLACE INTO "translators" VALUES ('add7c71c-21f3-ee14-d188-caf9da12728b', '2006 } catch (e) {} } - var callNumber = Scholar.Utilities.getNode(doc, doc, ''//tr/td[1][@class="holdingslist"]/text()'', nsResolver); + var callNumber = doc.evaluate(''//tr/td[1][@class="holdingslist"]/text()'', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(callNumber && callNumber.nodeValue) { newItem.callNumber = callNumber.nodeValue; } @@ -930,7 +928,7 @@ function doWeb(doc, url) { var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver); // Go through table rows for(var i=1; i<tableRows.length; i++) { - var input = Scholar.Utilities.getNode(doc, tableRows[i], ''.//input[@value="Details"]'', nsResolver); + var input = doc.evaluate(''.//input[@value="Details"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); checkboxes[i] = input.name; var text = Scholar.Utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver); if(text) { @@ -948,14 +946,14 @@ function doWeb(doc, url) { var m = hostRe.exec(doc.location.href); var hitlist = doc.forms.namedItem("hitlist"); var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value; - Scholar.Utilities.debugPrint(baseUrl); + Scholar.Utilities.debug(baseUrl); var uris = new Array(); for(var i in items) { uris.push(baseUrl+"&"+checkboxes[i]+"=Details"); } - Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, function() { Scholar.done() }, function() {}); Scholar.wait(); @@ -963,7 +961,7 @@ function doWeb(doc, url) { } '); -REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', +REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-26 16:01:00', 4, 'ProQuest', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', 'function detectWeb(doc, url) { if(doc.title == "Results") { return "magazineArticle"; @@ -1010,29 +1008,29 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); for (var i = 0; i < elmts.length; i++) { var elmt = elmts[i]; - var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase(); + var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue).toLowerCase(); if(field == "publication title") { - var publication = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver); + var publication = doc.evaluate(''./TD[2]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(publication.nodeValue) { newItem.publicationTitle = Scholar.Utilities.superCleanString(publication.nodeValue); } - var place = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var place = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(place.nodeValue) { newItem.place = Scholar.Utilities.superCleanString(place.nodeValue); } - var date = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver); + var date = doc.evaluate(''./TD[2]/A[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(date.nodeValue) { date = date.nodeValue; var jsDate = new Date(Scholar.Utilities.superCleanString(date)); if(!isNaN(jsDate.valueOf())) { - date = Scholar.Utilities.dateToISO(jsDate); + date = Scholar.Utilities.dateToSQL(jsDate); } newItem.date = date; } - var moreInfo = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver); + var moreInfo = doc.evaluate(''./TD[2]/text()[2]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(moreInfo.nodeValue) { moreInfo = Scholar.Utilities.superCleanString(moreInfo.nodeValue); var parts = moreInfo.split(";\xA0"); @@ -1060,10 +1058,10 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 } } } else if(field == "source type") { - var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(value.nodeValue) { value = Scholar.Utilities.superCleanString(value.nodeValue).toLowerCase(); - Scholar.Utilities.debugPrint(value); + Scholar.Utilities.debug(value); if(value.indexOf("periodical") >= 0) { newItem.itemType = "magazineArticle"; @@ -1074,7 +1072,7 @@ REPLACE INTO "translators" VALUES ('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006 } } } else if(field == "isbn" || field == "issn" || field == "issn/isbn") { - var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver); + var value = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(value) { var type; value = Scholar.Utilities.superCleanString(value.nodeValue); @@ -1110,7 +1108,7 @@ function doWeb(doc, url) { // Go through links for(var j=0; j<links.length; j++) { if(tagRegexp.test(links[j].href)) { - var text = Scholar.Utilities.getNode(doc, tableRows[i], ''./a[@class="bold"]/text()'', null); + var text = doc.evaluate(''./a[@class="bold"]/text()'', tableRows[i], null, XPathResult.ANY_TYPE, null).iterateNext(); if(text && text.nodeValue) { text = Scholar.Utilities.cleanString(text.nodeValue); items[links[j].href] = text; @@ -1130,7 +1128,7 @@ function doWeb(doc, url) { uris.push(i); } - Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, function() { Scholar.done(); }, function() {}); Scholar.wait(); @@ -1140,13 +1138,13 @@ function doWeb(doc, url) { if(m && (m[1] == "1" || m[1] == "2")) { scrape(doc); } else if(m) { - Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(browser) { scrape(browser.contentDocument); Scholar.done(); }, function() {}); + Scholar.Utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), function(doc) { scrape(doc); Scholar.done(); }, function() {}); Scholar.wait(); } } }'); -REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 4, 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', +REPLACE INTO "translators" VALUES ('6773a9af-5375-3224-d148-d32793884dec', '2006-06-26 16:01:00', 4, 'InfoTrac', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/', 'function detectWeb(doc, url) { if(doc.title.substring(0, 8) == "Article ") { return "magazineArticle"; @@ -1249,9 +1247,9 @@ function doWeb(doc, url) { var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body//table/tbody/tr/td[a/b]'', nsResolver); // Go through table rows for(var i=0; i<tableRows.length; i++) { - var link = Scholar.Utilities.getNode(doc, tableRows[i], ''./a'', nsResolver); + var link = doc.evaluate(''./a'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); uris[i] = link.href; - var article = Scholar.Utilities.getNode(doc, link, ''./b/text()'', nsResolver); + var article = doc.evaluate(''./b/text()'', link, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); items[i] = article.nodeValue; // Chop off final period if(items[i].substr(items[i].length-1) == ".") { @@ -1272,7 +1270,7 @@ function doWeb(doc, url) { } }'); -REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 4, 'LexisNexis Scraper', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)', +REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006-06-26 16:01:00', 4, 'LexisNexis', 'Simon Kornblith', '^http://web\.lexis-nexis\.com/universe/(?:document|doclist)', 'function detectWeb(doc, url) { var detailRe = new RegExp("^http://[^/]+/universe/document"); if(detailRe.test(doc.location.href)) { @@ -1302,7 +1300,7 @@ REPLACE INTO "translators" VALUES ('b047a13c-fe5c-6604-c997-bef15e502b09', '2006 var m = dateRegexp.exec(centerElements[centerElements.length-1].innerHTML); if(m) { var jsDate = new Date(m[1]+" "+m[2]); - newItem.date = Scholar.Utilities.dateToISO(jsDate); + newItem.date = Scholar.Utilities.dateToSQL(jsDate); } else { var elementParts = centerElements[centerElements.length-1].innerHTML.split(/<br[^>]*>/gi); newItem.date = elementParts[1]; @@ -1369,14 +1367,14 @@ function doWeb(doc, url) { uris.push(i); } - Scholar.Utilities.processDocuments(null, uris, function(browser) { scrape(browser.contentDocument) }, + Scholar.Utilities.processDocuments(uris, function(doc) { scrape(doc) }, function() { Scholar.done(); }, function() {}); Scholar.wait(); } }'); -REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph Scraper', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', +REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006-06-26 16:01:00', 4, 'Aleph', 'Simon Kornblith', '^http://[^/]+/F(?:/[A-Z0-9\-]+(?:\?.*)?$|\?func=find)', 'function detectWeb(doc, url) { var singleRe = new RegExp("^http://[^/]+/F/[A-Z0-9\-]+\?.*func=full-set-set.*\&format=[0-9]{3}"); @@ -1425,8 +1423,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006 } var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; + Scholar.Utilities.processDocuments(newUris, function(newDoc) { var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; @@ -1440,7 +1437,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006 var record = new marc.MARC_Record(); for(var i=0; i<elmts.length; i++) { var elmt = elmts[i]; - var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue); + var field = Scholar.Utilities.superCleanString(doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); var value = Scholar.Utilities.getNodeString(doc, elmt, ''./TD[2]//text()'', nsResolver); var value = value.replace(/\|([a-z]) /g, record.subfield_delimiter+"$1"); @@ -1467,7 +1464,7 @@ REPLACE INTO "translators" VALUES ('cf87eca8-041d-b954-795a-2d86348999d5', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix Scraper', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', +REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006-06-26 16:01:00', 4, 'Dynix', 'Simon Kornblith', 'ipac\.jsp\?.*(?:uri=full=[0-9]|menu=search)', 'function detectWeb(doc, url) { var detailsRe = new RegExp(''ipac\.jsp\?.*uri=full=[0-9]''); if(detailsRe.test(doc.location.href)) { @@ -1511,8 +1508,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006 var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - Scholar.Utilities.processDocuments(null, uris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; + Scholar.Utilities.processDocuments(uris, function(newDoc) { var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; @@ -1526,7 +1522,7 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006 var record = new marc.MARC_Record(); for(var i=0; i<elmts.length; i++) { var elmt = elmts[i]; - var field = Scholar.Utilities.superCleanString(Scholar.Utilities.getNode(newDoc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver).nodeValue); + var field = Scholar.Utilities.superCleanString(newDoc.evaluate(''./TD[1]/A[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); var value = Scholar.Utilities.getNodeString(newDoc, elmt, ''./TD[2]/TABLE[1]/TBODY[1]/TR[1]/TD[1]/A[1]//text()'', nsResolver); value = value.replace(/\$([a-z]) /g, record.subfield_delimiter+"$1"); @@ -1555,13 +1551,13 @@ REPLACE INTO "translators" VALUES ('774d7dc2-3474-2684-392c-f787789ec63d', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS Scraper', 'Simon Kornblith', '/chameleon(?:\?|$)', +REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-26 16:01:00', 4, 'VTLS', 'Simon Kornblith', '/chameleon(?:\?|$)', 'function detectWeb(doc, url) { - var node = Scholar.Utilities.getNode(doc, doc, ''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', null); + var node = doc.evaluate(''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext(); if(node) { return "multiple"; } - var node = Scholar.Utilities.getNode(doc, doc, ''//a[text()="marc"]'', null); + var node = doc.evaluate(''//a[text()="marc"]'', doc, null, XPathResult.ANY_TYPE, null).iterateNext(); if(node) { return "book"; } @@ -1602,7 +1598,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006 // Collect title information var fields = Scholar.Utilities.gatherElementsOnXPath(doc, tableRows[i], ''./td/table/tbody/tr[th]'', nsResolver); for(var j=0; j<fields.length; j++) { - var field = Scholar.Utilities.getNode(doc, fields[j], ''./th/text()'', nsResolver); + var field = doc.evaluate(''./th/text()'', fields[j], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(field.nodeValue == "Title") { var value = Scholar.Utilities.getNodeString(doc, fields[j], ''./td//text()'', nsResolver); if(value) { @@ -1620,15 +1616,14 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006 } for(var i in items) { - Scholar.Utilities.debugPrint(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); + Scholar.Utilities.debug(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); newUris.push(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); } } var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; + Scholar.Utilities.processDocuments(newUris, function(newDoc) { var uri = newDoc.location.href var namespace = newDoc.documentElement.namespaceURI; @@ -1641,10 +1636,10 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006 var record = new marc.MARC_Record(); for(var i=0; i<elmts.length; i++) { var elmt = elmts[i]; - var field = Scholar.Utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue; - var ind1 = Scholar.Utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver).nodeValue; - var ind2 = Scholar.Utilities.getNode(doc, elmt, ''./TD[3]/text()[1]'', nsResolver).nodeValue; - var value = Scholar.Utilities.getNode(doc, elmt, ''./TD[4]/text()[1]'', nsResolver).nodeValue; + var field = doc.evaluate(''./TD[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; + var ind1 = doc.evaluate(''./TD[2]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; + var ind2 = doc.evaluate(''./TD[3]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; + var value = doc.evaluate(''./TD[4]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; value = value.replace(/\\([a-z]) /g, record.subfield_delimiter+"$1"); record.add_field(field, ind1, ind2, value); @@ -1659,7 +1654,7 @@ REPLACE INTO "translators" VALUES ('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 4, 'DRA Scraper', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', +REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006-06-26 16:01:00', 4, 'DRA', 'Simon Kornblith', '/web2/tramp2\.exe/(?:see\_record/|authority\_hits/|goto/.*\?.*screen=Record\.html)', 'function detectWeb(doc, url) { if(doc.location.href.indexOf("/authority_hits") > 0) { return "multiple"; @@ -1729,7 +1724,7 @@ REPLACE INTO "translators" VALUES ('fb12ae9e-f473-cab4-0546-27ab88c64101', '2006 }'); -REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC Scraper', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', +REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006-06-26 16:01:00', 4, 'GEAC', 'Simon Kornblith', '/(?:GeacQUERY|(?:Geac)?FETCH[\:\?].*[&:]next=html/(?:record\.html|geacnffull\.html))', 'function detectWeb(doc, url) { if(doc.location.href.indexOf("/GeacQUERY") > 0) { return "multiple"; @@ -1764,8 +1759,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006 var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - Scholar.Utilities.processDocuments(null, uris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; + Scholar.Utilities.processDocuments(uris, function(newDoc) { var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; @@ -1817,7 +1811,7 @@ REPLACE INTO "translators" VALUES ('c0e6fda6-0ecd-e4f4-39ca-37a4de436e15', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006-06-26 16:01:00', 4, 'SIRSI -2003', 'Simon Kornblith', '/uhtbin/cgisirsi', 'function detectWeb(doc, url) { var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -1870,7 +1864,7 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006 var links = Scholar.Utilities.gatherElementsOnXPath(doc, elmts[i], ''.//a'', nsResolver); // Collect title - var myTd = Scholar.Utilities.getNode(doc, elmts[i], "./td[2]", nsResolver); + var myTd = doc.evaluate("./td[2]", elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); var m = titleRe.exec(myTd.innerHTML); var title = unescapeHTML(m[1]); @@ -1895,9 +1889,9 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006 var elmts = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver); for(var i=0; i<elmts.length; i++) { var elmt = elmts[i]; - var initialText = Scholar.Utilities.getNode(doc, elmt, ''./text()[1]'', nsResolver); + var initialText = doc.evaluate(''./text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(initialText && initialText.nodeValue && Scholar.Utilities.superCleanString(initialText.nodeValue) == "Viewing record") { - recNumbers.push(Scholar.Utilities.getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver).nodeValue); + recNumbers.push(doc.evaluate(''./b[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue); break; } } @@ -1953,7 +1947,7 @@ REPLACE INTO "translators" VALUES ('5287d20c-8a13-6004-4dcb-5bb2b66a9cc9', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore Scraper', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', +REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006-06-26 16:01:00', 4, 'TLC/YouSeeMore', 'Simon Kornblith', 'TLCScripts/interpac\.dll\?(?:.*LabelDisplay.*RecordNumber=[0-9]|Search|ItemTitles)', 'function detectWeb(doc, url) { var detailRe = new RegExp("TLCScripts/interpac\.dll\?.*LabelDisplay.*RecordNumber=[0-9]"); if(detailRe.test(doc.location.href)) { @@ -1989,8 +1983,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006 var marc = Scholar.loadTranslator("import", "a6ee60df-1ddc-4aae-bb25-45e0537be973"); - Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; + Scholar.Utilities.processDocuments(newUris, function(newDoc) { var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; @@ -2006,8 +1999,8 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006 for(var i=0; i<elmts.length; i++) { var elmt = elmts[i]; - tag = Scholar.Utilities.getNode(newDoc, elmt, ''./td[2]/tt[1]/text()[1]'', nsResolver).nodeValue; - var inds = Scholar.Utilities.getNode(newDoc, elmt, ''./td[3]/tt[1]/text()[1]'', nsResolver).nodeValue; + tag = newDoc.evaluate(''./td[2]/tt[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; + var inds = newDoc.evaluate(''./td[3]/tt[1]/text()[1]'', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue; tag = tag.replace(/[\r\n]/g, ""); if(tag.length == 1) { @@ -2051,7 +2044,7 @@ REPLACE INTO "translators" VALUES ('0f9fc2fc-306e-5204-1117-25bca009dffc', '2006 Scholar.wait(); }'); -REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 4, 'Project MUSE Scraper', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', +REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006-06-26 16:01:00', 4, 'Project MUSE', 'Simon Kornblith', '^http://muse\.jhu\.edu/(?:journals/[^/]+/[^/]+/[^/]+\.html|search/pia.cgi)', 'function detectWeb(doc, url) { var searchRe = new RegExp("^http://[^/]+/search/pia\.cgi"); if(searchRe.test(url)) { @@ -2073,8 +2066,8 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006 // Go through table rows for(var i=0; i<tableRows.length; i++) { // article_id is what we need to get it all as one file - var input = Scholar.Utilities.getNode(doc, tableRows[i], ''./tbody/tr/td/input[@name="article_id"]'', nsResolver); - var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//b/i/a/text()'', nsResolver); + var input = doc.evaluate(''./tbody/tr/td/input[@name="article_id"]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + var link = doc.evaluate(''.//b/i/a/text()'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(input && input.value && link && link.nodeValue) { items[input.value] = link.nodeValue; } @@ -2163,7 +2156,7 @@ REPLACE INTO "translators" VALUES ('c54d1932-73ce-dfd4-a943-109380e06574', '2006 } }'); -REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-08-07 21:55:00', 12, 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', +REPLACE INTO "translators" VALUES ('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-08-07 21:55:00', 12, 'PubMed', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', 'function detectWeb(doc, url) { if(doc.location.href.indexOf("list_uids=") >= 0) { return "journalArticle"; @@ -2239,7 +2232,7 @@ function detectSearch(item) { var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Day.text()+", "+article.Journal.JournalIssue.PubDate.Year.text(); var jsDate = new Date(date); if(!isNaN(jsDate.valueOf())) { - date = Scholar.Utilities.dateToISO(jsDate); + date = Scholar.Utilities.dateToSQL(jsDate); } } else if(article.Journal.JournalIssue.PubDate.Month.text().toString() != "") { var date = article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text(); @@ -2293,8 +2286,8 @@ function doWeb(doc, url) { var tableRows = Scholar.Utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver); // Go through table rows for(var i=0; i<tableRows.length; i++) { - var link = Scholar.Utilities.getNode(doc, tableRows[i], ''.//a'', nsResolver); - var article = Scholar.Utilities.getNode(doc, tableRows[i], ''./tr[2]/td[2]/text()[1]'', nsResolver); + var link = doc.evaluate(''.//a'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + var article = doc.evaluate(''./tr[2]/td[2]/text()[1]'', tableRows[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); items[link.href] = article.nodeValue; } @@ -2318,7 +2311,7 @@ function doSearch(item) { lookupPMIDs([getPMID(item.contextObject)]); }'); -REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 4, 'Embedded RDF Scraper', 'Simon Kornblith', NULL, +REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-26 16:41:00', 4, 'Embedded RDF', 'Simon Kornblith', NULL, 'function detectWeb(doc, url) { var metaTags = doc.getElementsByTagName("meta"); @@ -2347,7 +2340,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006 foundTitle = true; } translator.Scholar.RDF.addStatement(url, dc + tag.substr(3), value, true); - Scholar.Utilities.debugPrint(tag.substr(3) + " = " + value); + Scholar.Utilities.debug(tag.substr(3) + " = " + value); } else if(tag && value && (tag == "author" || tag == "author-personal")) { translator.Scholar.RDF.addStatement(url, dc + "creator", value, true); } else if(tag && value && tag == "author-corporate") { @@ -2362,7 +2355,7 @@ REPLACE INTO "translators" VALUES ('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006 translator.doImport(); }'); -REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS Scraper', 'Simon Kornblith', NULL, +REPLACE INTO "translators" VALUES ('05d07af9-105a-4572-99f6-a8e231c0daef', '2006-08-07 01:09:00', 4, 'COinS', 'Simon Kornblith', NULL, 'function detectWeb(doc, url) { var spanTags = doc.getElementsByTagName("span"); @@ -2413,7 +2406,7 @@ function retrieveNextCOinS(needFullItems, newItems) { if(needFullItems.length) { var item = needFullItems.shift(); - Scholar.Utilities.debugPrint("looking up contextObject"); + Scholar.Utilities.debug("looking up contextObject"); var search = Scholar.loadTranslator("search"); search.setHandler("itemDone", function(obj, item) { newItems.push(item); @@ -2490,7 +2483,7 @@ function doWeb(doc, url) { } }'); -REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books Scraper', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', +REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-26 16:01:00', 4, 'Google Books', 'Simon Kornblith', '^http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', 'function detectWeb(doc, url) { var re = new RegExp(''^http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i''); if(re.test(doc.location.href)) { @@ -2526,8 +2519,7 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006 } } - Scholar.Utilities.processDocuments(null, newUris, function(newBrowser) { - var newDoc = newBrowser.contentDocument; + Scholar.Utilities.processDocuments(newUris, function(newDoc) { var newItem = new Scholar.Item("book"); newItem.source = newDoc.location.href; @@ -2539,8 +2531,8 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006 var xpath = ''//table[@id="bib"]/tbody/tr''; var elmts = Scholar.Utilities.gatherElementsOnXPath(newDoc, newDoc, xpath, nsResolver); for(var i = 0; i<elmts.length; i++) { - var field = Scholar.Utilities.getNode(newDoc, elmts[i], ''./td[1]//text()'', nsResolver); - var value = Scholar.Utilities.getNode(newDoc, elmts[i], ''./td[2]//text()'', nsResolver); + var field = newDoc.evaluate(''./td[1]//text()'', elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); + var value = newDoc.evaluate(''./td[2]//text()'', elmts[i], nsResolver, XPathResult.ANY_TYPE, null).iterateNext(); if(field && value) { field = Scholar.Utilities.superCleanString(field.nodeValue); @@ -2564,7 +2556,7 @@ REPLACE INTO "translators" VALUES ('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006 jsDate = new Date(value); if(!isNaN(jsDate.valueOf())) { - date = Scholar.Utilities.dateToISO(jsDate); + date = Scholar.Utilities.dateToSQL(jsDate); } newItem.date = date; @@ -2618,8 +2610,7 @@ function doSearch(item) { var co = Scholar.Utilities.createContextObject(item); } - Scholar.Utilities.processDocuments(null, ["http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co], function(browser) { - var doc = browser.contentDocument; + Scholar.Utilities.loadDocument("http://partneraccess.oclc.org/wcpa/servlet/OpenUrl?"+co, function(doc) { // find new COinS in the Open WorldCat page if(processOWC(doc)) { // we got a single item page Scholar.done(); @@ -2644,16 +2635,16 @@ function doSearch(item) { urlsToProcess.push(elmt.href); } while(elmt = elmts.iterateNext()); - Scholar.Utilities.processDocuments(null, urlsToProcess, function(browser) { + Scholar.Utilities.processDocuments(urlsToProcess, function(doc) { // per URL - processOWC(browser.contentDocument); + processOWC(doc); }, function() { // done Scholar.done(); }, function() { // error Scholar.done(false); }); } - }, null, function() { + }, function() { error(); }); @@ -3031,7 +3022,7 @@ function doImport() { while(read = Scholar.read(16384)) { text += read; } - Scholar.Utilities.debugPrint("read in"); + Scholar.Utilities.debug("read in"); // eliminate <?xml ?> heading so we can parse as XML text = text.replace(/<\?xml[^?]+\?>/, ""); @@ -3043,9 +3034,9 @@ function doImport() { var xml = new XML(text); for each(var mods in xml.m::mods) { - Scholar.Utilities.debugPrint("item is: "); + Scholar.Utilities.debug("item is: "); for(var i in mods) { - Scholar.Utilities.debugPrint(i+" = "+mods[i].toString()); + Scholar.Utilities.debug(i+" = "+mods[i].toString()); } var newItem = new Scholar.Item(); @@ -3863,8 +3854,8 @@ function doImport() { } else if(type == n.bib+"Memo") { // check to see if this note is independent var arcs = Scholar.RDF.getArcsIn(node); - Scholar.Utilities.debugPrint("working on a note"); - Scholar.Utilities.debugPrint(arcs); + Scholar.Utilities.debug("working on a note"); + Scholar.Utilities.debug(arcs); var skip = false; for each(var arc in arcs) { arc = Scholar.RDF.getResourceURI(arc); @@ -4239,7 +4230,7 @@ function doImport() { var tag = data = false; do { // first valid line is type line = Scholar.read(); - Scholar.Utilities.debugPrint(line); + Scholar.Utilities.debug(line); } while(line !== false && line.substr(0, 6) != "TY - "); var item = new Scholar.Item(); @@ -4258,7 +4249,7 @@ function doImport() { tag = line.substr(0,2); data = line.substr(6); - Scholar.Utilities.debugPrint("tag: ''"+tag+"''; data: ''"+data+"''"); + Scholar.Utilities.debug("tag: ''"+tag+"''; data: ''"+data+"''"); if(tag == "ER") { // ER signals end of reference // unset info @@ -4692,7 +4683,7 @@ MARC_Record.prototype._associateDBField = function(item, fieldNo, part, fieldNam part = ''a''; } var field = this.get_field_subfields(fieldNo); - Scholar.Utilities.debugPrint(''Found ''+field.length+'' matches for ''+fieldNo+part); + Scholar.Utilities.debug(''Found ''+field.length+'' matches for ''+fieldNo+part); if(field) { for(var i in field) { var value = false;