www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 83c36f330d3fc3712398b2181034e3174d446285
parent 974228338981254f9a3db8c43c8d144701a8c43e
Author: Simon Kornblith <simon@simonster.com>
Date:   Fri, 23 Jun 2006 16:17:53 +0000

Scrapable search results for SIRSI 2003+ scraper


Diffstat:
M scrapers.sql | 177 +++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 114 insertions(+), 63 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql @@ -1,7 +1,7 @@ -- 12 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 10:11:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-23 12:17:00')); REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -661,7 +661,7 @@ if(newUri) { wait();'); -REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-12 09:58:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', +REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-23 12:17:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi', 'var namespace = doc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { if (prefix == ''x'') return namespace; else return null; @@ -672,6 +672,12 @@ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); if(elmts.length) { return true; } +var xpath = ''//td[@class="searchsum"]/table''; +var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); +if(elmts.length) { + return true; +} + return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -683,77 +689,122 @@ var nsResolver = namespace ? 
function(prefix) { if (prefix == ''x'') return namespace; else return null; } : null; -var uri = doc.location.href; var data = new Object(); -var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; -var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); -for (var i = 0; i < elmts.length; i++) { - var elmt = elmts[i]; - try { - var node = utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); - if(!node) { - var node = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); - } - if(node) { - var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); - field = field.toLowerCase(); - var value = utilities.superCleanString(node.nodeValue); - var rdfUri = null; - if(field == "publisher") { - rdfUri = prefixDC + ''publisher''; - } else if(field == "pub date") { - rdfUri = prefixDC + ''year''; - - var re = /[0-9]+/; - var m = re.exec(value); - value = m[0]; - } else if(field == "isbn") { - rdfUri = prefixDC + ''identifier''; - - var re = /^[0-9](?:[0-9X]+)/; - var m = re.exec(value); - value = m[0]; - } else if(field == "title") { - rdfUri = prefixDC + ''title''; - var titleParts = value.split(" / "); - value = titleParts[0]; - } else if(field == "publication info") { - rdfUri = prefixDummy + ''place''; - var pubParts = value.split(" : "); - value = pubParts[0]; - } else if(field == "personal author") { - rdfUri = prefixDC + ''creator''; - value = utilities.cleanAuthor(node.nodeValue); - } else if(field == "added author") { - rdfUri = prefixDC + ''contributor''; - value = utilities.cleanAuthor(node.nodeValue); - } else if(field == "corporate author") { - rdfUri = prefixDummy + ''corporateCreator''; +function scrape(doc) { + var uri = doc.location.href; + + var xpath = ''//tr[th[@class="viewmarctags"]][td[@class="viewmarctags"]]''; + var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver); + if(!elmts.length) { + return false; + } + for 
(var i = 0; i < elmts.length; i++) { + var elmt = elmts[i]; + try { + var node = utilities.getNode(doc, elmt, ''./TD[1]/A[1]/text()[1]'', nsResolver); + if(!node) { + var node = utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver); } - if(rdfUri) { - var insert = true; - if(data && data[rdfUri]) { - for(j in data[rdfUri]) { - if(data[rdfUri][j] == value) { - insert = false; - break; + if(node) { + var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TH[1]/text()[1]'', nsResolver).nodeValue); + field = field.toLowerCase(); + var value = utilities.superCleanString(node.nodeValue); + var rdfUri = null; + if(field == "publisher") { + rdfUri = prefixDC + ''publisher''; + } else if(field == "pub date") { + rdfUri = prefixDC + ''year''; + + var re = /[0-9]+/; + var m = re.exec(value); + value = m[0]; + } else if(field == "isbn") { + rdfUri = prefixDC + ''identifier''; + + var re = /^[0-9](?:[0-9X]+)/; + var m = re.exec(value); + value = m[0]; + } else if(field == "title") { + rdfUri = prefixDC + ''title''; + var titleParts = value.split(" / "); + value = titleParts[0]; + } else if(field == "publication info") { + rdfUri = prefixDummy + ''place''; + var pubParts = value.split(" : "); + value = pubParts[0]; + } else if(field == "personal author") { + rdfUri = prefixDC + ''creator''; + value = utilities.cleanAuthor(node.nodeValue); + } else if(field == "added author") { + rdfUri = prefixDC + ''contributor''; + value = utilities.cleanAuthor(node.nodeValue); + } else if(field == "corporate author") { + rdfUri = prefixDummy + ''corporateCreator''; + } + if(rdfUri) { + var insert = true; + if(data && data[rdfUri]) { + for(j in data[rdfUri]) { + if(data[rdfUri][j] == value) { + insert = false; + break; + } } + } else if(!data[rdfUri]) { + data[rdfUri] = new Array(); + } + if(insert) { + data[rdfUri].push(value); + model.addStatement(uri, rdfUri, value, true); } - } else if(!data[rdfUri]) { - data[rdfUri] = new Array(); - } - if(insert) { - 
data[rdfUri].push(value); - model.addStatement(uri, rdfUri, value, true); } } - } - } catch (e) {} + } catch (e) {} + } + model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); + return true; } -model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); +if(!scrape(doc)) { + var checkboxes = new Array(); + var urls = new Array(); + var availableItems = new Array(); + + var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//td[@class="searchsum"]/table[//input[@value="Details"]]'', nsResolver); + // Go through table rows + for(var i=1; i<tableRows.length; i++) { + var input = utilities.getNode(doc, tableRows[i], ''.//input[@value="Details"]'', nsResolver); + checkboxes[i] = input.name; + var text = utilities.getNodeString(doc, tableRows[i], ''.//label/strong//text()'', nsResolver); + if(text) { + availableItems[i] = text; + } + } + + var items = utilities.selectItems(availableItems); + + if(!items) { + return true; + } + + var hostRe = new RegExp("^http://[^/]+"); + var m = hostRe.exec(doc.location.href); + var hitlist = doc.forms.namedItem("hitlist"); + var baseUrl = m[0]+hitlist.getAttribute("action")+"?first_hit="+hitlist.elements.namedItem("first_hit").value+"&last_hit="+hitlist.elements.namedItem("last_hit").value; + utilities.debugPrint(baseUrl); + + var uris = new Array(); + for(i in items) { + uris.push(baseUrl+"&"+checkboxes[i]+"=Details"); + } + + utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { done() }, function() {}); + + wait(); +} '); REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', 'http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',