commit a48ea7dabf919ffc74160d8c87004936811ef9eb
parent 7402577806ebec0498d1a1f139e8cc91c30c714b
Author: Simon Kornblith <simon@simonster.com>
Date: Sun, 25 Jun 2006 19:32:49 +0000
Search results scraping for ProQuest
Diffstat:
| M | scrapers.sql | | | 252 | ++++++++++++++++++++++++++++++++++++++++++++++++------------------------------- |
1 file changed, 154 insertions(+), 98 deletions(-)
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,7 +1,7 @@
--- 20
+-- 21
-- Set the following timestamp to the most recent scraper update date
-REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-25 14:33:00'));
+REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-25 15:32:00'));
REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
@@ -660,8 +660,7 @@ if(doc.title == "History Cooperative: Search Results") {
wait();
} else {
scrape(doc);
-}
-');
+}');
REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-23 12:49:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)',
'// First, check to see if the URL alone reveals InnoPAC, since some sites don''t reveal the MARC button
@@ -810,7 +809,7 @@ if(newUri) {
wait();');
-REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-23 12:17:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
+REPLACE INTO "scrapers" VALUES('add7c71c-21f3-ee14-d188-caf9da12728b', '2006-06-25 15:32:00', 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
'var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
@@ -956,7 +955,7 @@ if(!scrape(doc)) {
}
');
-REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?(?:.*\&)?did=', '',
+REPLACE INTO "scrapers" VALUES('a77690cf-c5d1-8fc4-110f-d1fc765dcf88', '2006-06-18 09:58:00', 'ProQuest Scraper', 'Simon Kornblith', '^http://proquest\.umi\.com/pqdweb\?((?:.*\&)?did=.*&Fmt=[0-9]|(?:.*\&)Fmt=[0-9].*&did=|(?:.*\&)searchInterface=)', '',
'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
@@ -967,111 +966,168 @@ var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
-var uri = doc.location.href;
-var data = new Object();
-
-// Title
-var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'';
-var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-var title = "";
-for (var i = 0; i < elmts.length; i++) {
- var elmt = elmts[i];
- title += elmt.nodeValue;
-}
-if(title) {
- model.addStatement(uri, prefixDC + ''title'', title, true);
-}
-
-// Authors
-var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="textMedium"]/a/em'';
-var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-for (var i = 0; i < elmts.length; i++) {
- var elmt = elmts[i];
+function scrape(doc) { // Scrapes one ProQuest record page into model statements, all keyed on doc.location.href
+ var uri = doc.location.href;
- // Dirty hack to fix highlighted words
- var xpath = ''.//text()'';
- var author = "";
- var authorElmts = utilities.gatherElementsOnXPath(doc, elmt, xpath, nsResolver);
- for (var j = 0; j < authorElmts.length; j++) {
- var authorElmt = authorElmts[j];
- author += authorElmt.nodeValue;
+ // Title: joined from every text node under the strong element of the headerBlack cell
+ var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="headerBlack"]/strong//text()'';
+ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+ var title = "";
+ for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+ title += elmt.nodeValue;
}
- model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(author), true);
-}
-
-// Other info
-var xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr'';
-var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
-for (var i = 0; i < elmts.length; i++) {
- var elmt = elmts[i];
- var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase();
- if(field == "publication title") {
- var publication = utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver);
- if(publication.nodeValue) {
- model.addStatement(uri, prefixDummy + ''publication'', utilities.superCleanString(publication.nodeValue), true);
- }
- var place = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
- if(place.nodeValue) {
- model.addStatement(uri, prefixDummy + ''place'', utilities.superCleanString(place.nodeValue), true);
- }
- var date = utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver);
- if(date.nodeValue) {
- var jsDate = new Date(utilities.superCleanString(date.nodeValue));
- model.addStatement(uri, prefixDC + ''date'', utilities.dateToISO(jsDate), true);
+ if(title) { // an empty title adds no statement
+ model.addStatement(uri, prefixDC + ''title'', title, true);
+ }
+
+ // Authors: one dc:creator statement per em element found inside an author link
+ var xpath = ''/html/body/span[@class="textMedium"]/table/tbody/tr/td[@class="textMedium"]/a/em'';
+ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+ for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+
+ // Dirty hack to fix highlighted words: rebuild the name from all descendant text nodes
+ var xpath = ''.//text()'';
+ var author = "";
+ var authorElmts = utilities.gatherElementsOnXPath(doc, elmt, xpath, nsResolver);
+ for (var j = 0; j < authorElmts.length; j++) {
+ var authorElmt = authorElmts[j];
+ author += authorElmt.nodeValue;
}
- var moreInfo = utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver);
- if(moreInfo.nodeValue) {
- moreInfo = utilities.superCleanString(moreInfo.nodeValue);
- var parts = moreInfo.split(";\xA0");
-
- var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/
- var issueInfo = parts[0].split(",\xA0");
- for(j in issueInfo) {
- var m = issueRegexp.exec(issueInfo[j]);
- var info = m[1].toLowerCase();
- if(info == "vol") {
- model.addStatement(uri, prefixDummy + ''volume'', utilities.superCleanString(m[2]), true);
- } else if(info == "iss" || info == "no") {
- model.addStatement(uri, prefixDummy + ''number'', utilities.superCleanString(m[2]), true);
+ model.addStatement(uri, prefixDC + ''creator'', utilities.cleanAuthor(author), true);
+ }
+
+ // Other info: each table row is dispatched on the lowercased label in its first cell
+ var xpath = ''/html/body/span[@class="textMedium"]/font/table/tbody/tr'';
+ var elmts = utilities.gatherElementsOnXPath(doc, doc, xpath, nsResolver);
+ for (var i = 0; i < elmts.length; i++) {
+ var elmt = elmts[i];
+ var field = utilities.superCleanString(utilities.getNode(doc, elmt, ''./TD[1]/text()[1]'', nsResolver).nodeValue).toLowerCase();
+ if(field == "publication title") { // this row yields publication, place, date, volume/number and pages
+ var publication = utilities.getNode(doc, elmt, ''./TD[2]/A[1]/text()[1]'', nsResolver);
+ if(publication.nodeValue) {
+ model.addStatement(uri, prefixDummy + ''publication'', utilities.superCleanString(publication.nodeValue), true);
+ }
+ var place = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+ if(place.nodeValue) {
+ model.addStatement(uri, prefixDummy + ''place'', utilities.superCleanString(place.nodeValue), true);
+ }
+ var date = utilities.getNode(doc, elmt, ''./TD[2]/A[2]/text()[1]'', nsResolver);
+ if(date.nodeValue) {
+ date = date.nodeValue; // keep the raw text as the fallback value
+ var jsDate = new Date(utilities.superCleanString(date));
+ if(!isNaN(jsDate.valueOf())) { // convert to ISO only when Date could parse the text
+ date = utilities.dateToISO(jsDate);
}
+ model.addStatement(uri, prefixDC + ''date'', date, true);
}
- if(parts[1] && utilities.superCleanString(parts[1]).substring(0, 3).toLowerCase() == "pg.") {
- var re = /[0-9\-]+/;
- var m = re.exec(parts[1]);
+ var moreInfo = utilities.getNode(doc, elmt, ''./TD[2]/text()[2]'', nsResolver);
+ if(moreInfo.nodeValue) {
+ moreInfo = utilities.superCleanString(moreInfo.nodeValue);
+ var parts = moreInfo.split(";\xA0"); // parts[0] holds volume/issue info, parts[1] (if any) is checked for pages below
- if(m) {
- model.addStatement(uri, prefixDummy + ''pages'', m[0], true);
+ var issueRegexp = /^(\w+)\.(?: |\xA0)?(.+)$/
+ var issueInfo = parts[0].split(",\xA0");
+ for(j in issueInfo) { // NOTE(review): for-in over the array from split(); j is the function-scoped var declared in the author loop
+ var m = issueRegexp.exec(issueInfo[j]);
+ if(m) { // pieces the issue regexp does not match are skipped
+ var info = m[1].toLowerCase();
+ if(info == "vol") {
+ model.addStatement(uri, prefixDummy + ''volume'', utilities.superCleanString(m[2]), true);
+ } else if(info == "iss" || info == "no") {
+ model.addStatement(uri, prefixDummy + ''number'', utilities.superCleanString(m[2]), true);
+ }
+ }
+ }
+ if(parts[1] && utilities.superCleanString(parts[1]).substring(0, 3).toLowerCase() == "pg.") {
+ var re = /[0-9\-]+/; // first run of digits and hyphens is taken as the page range
+ var m = re.exec(parts[1]);
+
+ if(m) {
+ model.addStatement(uri, prefixDummy + ''pages'', m[0], true);
+ }
}
}
- }
- } else if(field == "source type") {
- var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
- if(value.nodeValue) {
- value = utilities.superCleanString(value.nodeValue).toLowerCase();
-
- if(value == "periodical") {
- model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false);
- } else if(value == "newspaper") {
- model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false);
- } else {
- model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+ } else if(field == "source type") {
+ var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+ if(value.nodeValue) {
+ value = utilities.superCleanString(value.nodeValue).toLowerCase();
+ utilities.debugPrint(value); // NOTE(review): debugging output; confirm it is meant to stay
+
+ if(value.indexOf("periodical") >= 0) { // substring match, so any label containing the word counts
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "magazineArticle", false);
+ } else if(value.indexOf("newspaper") >= 0) {
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "newspaperArticle", false);
+ } else { // every other source type is recorded as a book
+ model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false);
+ }
}
- }
- } else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
- var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
- if(value) {
- var type;
- value = utilities.superCleanString(value.nodeValue);
- if(value.length == 10 || value.length == 13) {
- type = "ISBN";
- } else if(value.length == 8) {
- type = "ISSN";
+ } else if(field == "isbn" || field == "issn" || field == "issn/isbn") {
+ var value = utilities.getNode(doc, elmt, ''./TD[2]/text()[1]'', nsResolver);
+ if(value) { // NOTE(review): tests the node itself, while the other branches test nodeValue - confirm intended
+ var type;
+ value = utilities.superCleanString(value.nodeValue);
+ if(value.length == 10 || value.length == 13) { // classified purely by string length: 10 or 13 is ISBN, 8 is ISSN
+ type = "ISBN";
+ } else if(value.length == 8) {
+ type = "ISSN";
+ }
+ if(type) { // other lengths leave type undefined and add no identifier
+ model.addStatement(uri, prefixDC + "identifier", type+" "+value, false);
+ }
}
- if(type) {
- model.addStatement(uri, prefixDC + "identifier", type+" "+value, false);
+ }
+ }
+}
+
+if(doc.title == "Results") { // results page: collect record links, hand them to utilities.selectItems, then scrape each returned URL
+ var items = new Object(); // maps record URL to the link text shown for it
+
+ // Require link to match this: a pqdweb URL carrying both did= and Fmt=1 or Fmt=2, in either order
+ var tagRegexp = new RegExp(); // NOTE(review): in the second alternative below (?:.*&) is not optional, unlike the first - confirm
+ tagRegexp.compile(''^http://[^/]+/pqdweb\\?((?:.*&)?did=.*&Fmt=[12]|(?:.*&)Fmt=[12].*&did=)'');
+
+ var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr[@class="rowUnMarked"]/td[3][@class="textMedium"]'', nsResolver);
+ // Go through the matched cells (third td of each rowUnMarked row)
+ for(var i=0; i<tableRows.length; i++) {
+ var links = utilities.gatherElementsOnXPath(doc, tableRows[i], ''.//a'', nsResolver);
+ // Go through links; only the first one matching tagRegexp is considered
+ for(var j=0; j<links.length; j++) {
+ if(tagRegexp.test(links[j].href)) {
+ var text = utilities.getNode(doc, tableRows[i], ''./a[@class="bold"]/text()'', null); // label comes from the bold link in the same cell
+ if(text && text.nodeValue) {
+ text = utilities.cleanString(text.nodeValue);
+ items[links[j].href] = text;
+ }
+ break; // at most one entry per cell
}
}
}
+ items = utilities.selectItems(items); // presumably prompts the user to choose among the entries - TODO confirm
+
+ if(!items) { // a falsy result from selectItems stops here without scraping
+ return true; // NOTE(review): return at script level; presumably this code runs inside a function wrapper - confirm
+ }
+
+ var uris = new Array();
+ for(i in items) { // the keys of items are the record URLs
+ uris.push(i);
+ }
+
+ utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) },
+ function() { done(); }, function() {}); // callbacks in order: scrape a loaded document, call done(), empty function
+
+ wait();
+} else { // any other page is handled as a single record
+ var fmtCheck = /(?:\&|\?)Fmt=([0-9]+)/ // captures the numeric Fmt query parameter
+ var m = fmtCheck.exec(doc.location.href);
+ if(m && (m[1] == "1" || m[1] == "2")) { // Fmt 1 and 2 are scraped from the current document
+ scrape(doc);
+ } else if(m) { // any other Fmt: load the same URL with Fmt=1 and scrape that; URLs without Fmt do nothing
+ utilities.loadDocument(doc.location.href.replace("Fmt="+m[1], "Fmt=1"), browser, function(browser) { scrape(browser.contentDocument); done(); }, function() {});
+ wait();
+ }
}');
REPLACE INTO "scrapers" VALUES('6773a9af-5375-3224-d148-d32793884dec', '2006-06-18 11:19:00', 'InfoTrac Scraper', 'Simon Kornblith', '^http://infotrac-college\.thomsonlearning\.com/itw/infomark/',