www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 8fe72b3e3cd360e2f75a4a5973c85a605e7c4157
parent 641d7054cc69ad86de6dd264f54429b5386f78fc
Author: Simon Kornblith <simon@simonster.com>
Date:   Fri, 23 Jun 2006 19:22:24 +0000

Search results scraping for VTLS


Diffstat:
Mscrapers.sql | 79+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------
1 file changed, 71 insertions(+), 8 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql @@ -1,7 +1,7 @@ --- 12 +-- 13 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 13:34:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 15:21:00')); REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -1199,17 +1199,81 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { wait();'); -REPLACE INTO "scrapers" VALUES('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-18 11:19:00', 'VTLS Scraper', 'Simon Kornblith', 'chameleon\?.*function=(?:CARDSCR|INITREQ)', NULL, +REPLACE INTO "scrapers" VALUES('63a0a351-3131-18f4-21aa-f46b9ac51d87', '2006-06-23 15:21:00', 'VTLS Scraper', 'Simon Kornblith', '\/chameleon(?:\?|$)', +'var node = utilities.getNode(doc, doc, ''//a[text()="marc"]'', null); +if(node) { + return true; +} +var node = utilities.getNode(doc, doc, ''//tr[@class="intrRow"]/td/table/tbody/tr[th]'', null); +if(node) { + return true; +} +return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; +var namespace = doc.documentElement.namespaceURI; +var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; +} : null; + var uri = doc.location.href; -var newUri = uri.replace(/function=[A-Z]{7}/, "function=MARCSCR"); +var newUris = new Array(); -utilities.loadDocument(newUri, browser, function(newBrowser) { - newDoc = newBrowser.contentDocument; +var marcs = utilities.gatherElementsOnXPath(doc, doc, ''//a[text()="marc"]'', nsResolver); + +if(marcs.length == 1) { + newUris.push(marcs[0].href) +} else { + // Require link to match this + var tagRegexp = new RegExp(); + tagRegexp.compile("/chameleon\?.*function=CARDSCR"); + + var items = new Array(); + + var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//tr[@class="intrRow"]'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver); + // Go through links + var url; + for(var j=0; j<links.length; j++) { + if(tagRegexp.test(links[j].href)) { + url = links[j].href; + } + } + if(url) { + // Collect title information + var fields = utilities.gatherElementsOnXPath(doc, tableRows[i], ''./td/table/tbody/tr[th]'', nsResolver); + for(var j=0; j<fields.length; j++) { + var field = utilities.getNode(doc, fields[j], ''./th/text()'', nsResolver); + if(field.nodeValue == "Title") { + var value = utilities.getNodeString(doc, fields[j], ''./td//text()'', nsResolver); + if(value) { + items[url] = utilities.cleanString(value); + } + } + } + } + } + + items = utilities.selectItems(items); + + if(!items) { + return true; + } + + for(i in items) { + utilities.debugPrint(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); + newUris.push(i.replace(/function=[A-Z]{7}/, "function=MARCSCR")); + } +} + +utilities.processDocuments(browser, null, newUris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -1231,8 +1295,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { } utilities.importMARCRecord(record, uri, model); - done(); -}, function() {}) +}, function(){ done(); }, function() {}); wait();');