commit 1e48189c3bf137fbe12ad9944c13d571b6eb93f5
parent 07dad8fae97d4582f9af8d4d8f8946e8991c37de
Author: Simon Kornblith <simon@simonster.com>
Date: Wed, 7 Jun 2006 17:44:55 +0000
Add SIRSI (old) scraper
Diffstat:
| M | scrapers.sql | | | 207 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++------------------------ |
1 file changed, 144 insertions(+), 63 deletions(-)
diff --git a/scrapers.sql b/scrapers.sql
@@ -1,4 +1,4 @@
--- 2
+-- 3
DELETE FROM scrapers;
INSERT INTO "scrapers" VALUES(1, NULL, NULL, 20060603002000, 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/gp/product/', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
var prefixDC = ''http://purl.org/dc/elements/1.1/'';
@@ -131,63 +131,63 @@ utilities.HTTPUtilities.doPost(newUri, ''exportselect=record&exporttype=plaintex
for(var i=0;i<lines.length;i++) {
match = lineRegexp.exec(lines[i]);
if(match) {
- if(match[1] == ''Title'') {
- var title = match[2];
- if(!lineRegexp.test(lines[i+1])) {
- i++;
- title += '' ''+lines[i];
- }
- if(title.substring(title.length-2) == " /") {
- title = title.substring(0, title.length-2);
- }
- model.addStatement(uri, prefixDC + ''title'', title);
- } else if(match[1] == ''Author(s)'') {
- var authors = match[2].split('';'');
- if(authors) {
- model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(authors[0]));
- for(var j=1; j<authors.length; j+=2) {
- if(authors[j-1].substring(0, 1) == ''('') {
- j++;
- }
- model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(authors[j]));
- }
+ if(match[1] == ''Title'') {
+ var title = match[2];
+ if(!lineRegexp.test(lines[i+1])) {
+ i++;
+ title += '' ''+lines[i];
+ }
+ if(title.substring(title.length-2) == " /") {
+ title = title.substring(0, title.length-2);
+ }
+ model.addStatement(uri, prefixDC + ''title'', title);
+ } else if(match[1] == ''Author(s)'') {
+ var authors = match[2].split('';'');
+ if(authors) {
+ model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(authors[0]));
+ for(var j=1; j<authors.length; j+=2) {
+ if(authors[j-1].substring(0, 1) == ''('') {
+ j++;
+ }
+ model.addStatement(uri, prefixDC + ''creator'', cleanAuthor(authors[j]));
+ }
+ } else {
+ model.addStatement(uri, prefixDC + ''creator'', utilities.trimString(match[2]));
+ }
+ } else if(match[1] == ''Publication'') {
+ // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
+ match[2] = utilities.trimString(match[2]);
+ if(match[2].substring(match[2].length-1) == '','') {
+ match[2] = match[2].substring(0, match[2].length-1);
+ }
+ model.addStatement(uri, prefixDC + ''publisher'', match[2]);
+ } else if(match[1] == ''Language'') {
+ model.addStatement(uri, prefixDC + ''language'', utilities.trimString(match[2]));
+ } else if(match[1] == ''Standard No'') {
+ var identifiers = match[2].split(/ +/);
+ var j=0;
+ while(j<(identifiers.length-1)) {
+ var type = identifiers[j].substring(0, identifiers[j].length-1);
+ var lastChar;
+ var value;
+
+ j++;
+ while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') {
+ if(identifiers[j].substring(0, 1) != ''('') {
+ if(lastChar == '';'') {
+ value = identifiers[j].substring(0, identifiers[j].length-1);
} else {
- model.addStatement(uri, prefixDC + ''creator'', utilities.trimString(match[2]));
- }
- } else if(match[1] == ''Publication'') {
- // Don''t even try to deal with this. The WorldCat metadata is of poor enough quality that this isn''t worth it.
- match[2] = utilities.trimString(match[2]);
- if(match[2].substring(match[2].length-1) == '','') {
- match[2] = match[2].substring(0, match[2].length-1);
- }
- model.addStatement(uri, prefixDC + ''publisher'', match[2]);
- } else if(match[1] == ''Language'') {
- model.addStatement(uri, prefixDC + ''language'', utilities.trimString(match[2]));
- } else if(match[1] == ''Standard No'') {
- var identifiers = match[2].split(/ +/);
- var j=0;
- while(j<(identifiers.length-1)) {
- var type = identifiers[j].substring(0, identifiers[j].length-1);
- var lastChar;
- var value;
-
- j++;
- while(j<identifiers.length && (lastChar = identifiers[j].substring(identifiers[j].length-1)) != '':'') {
- if(identifiers[j].substring(0, 1) != ''('') {
- if(lastChar == '';'') {
- value = identifiers[j].substring(0, identifiers[j].length-1);
- } else {
- value = identifiers[j];
- }
- model.addStatement(uri, prefixDC + ''identifier'', type + '' '' + value);
- }
- j++;
- }
+ value = identifiers[j];
}
- } else if(match[1] == ''Year'') {
- model.addStatement(uri, prefixDC + ''year'', match[2]);
+ model.addStatement(uri, prefixDC + ''identifier'', type + '' '' + value);
+ }
+ j++;
}
}
+ } else if(match[1] == ''Year'') {
+ model.addStatement(uri, prefixDC + ''year'', match[2]);
+ }
+ }
}
done();
@@ -459,7 +459,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
wait();');
-INSERT INTO "scrapers" VALUES(7, NULL, NULL, 20060603002000, 'SIRSI Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
+INSERT INTO "scrapers" VALUES(7, NULL, NULL, 20060603002000, 'SIRSI 2003+ Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
'var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
@@ -1097,8 +1097,10 @@ var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
var uri = doc.location.href;
var newUri = uri.replace(/([:&])next=html\/geacnffull.html/, "$1next=html/marc.html");
newUri = newUri.replace(/([:&])next=html\/record.html/, "$1next=html/marc.html");
-
-utilities.debugPrint(newUri);
+
+// Evaluates xpath relative to contextNode and returns the first node yielded
+// by the result iterator (iterateNext gives null when nothing is left).
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+    var matches = doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE, null);
+    return matches.iterateNext();
+};
utilities.loadDocument(newUri, browser, function(newBrowser) {
newDoc = newBrowser.contentDocument;
@@ -1107,11 +1109,6 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
var nsResolver = namespace ? function(prefix) {
if (prefix == ''x'') return namespace; else return null;
} : null;
-
- var getNode = function(doc, contextNode, xpath, nsResolver) {
- return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
- }
-
var record = new MARC_Record();
@@ -1123,6 +1120,7 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
if(line.substring(0, 6) == " ") {
content += " "+line.substring(6);
+ continue;
} else {
if(tag) {
record.add_field(tag, ind1, ind2, content);
@@ -1145,11 +1143,94 @@ utilities.loadDocument(newUri, browser, function(newBrowser) {
content = line.substring(4);
}
- utilities.debugPrint(''tag:''+tag+'' ind1:''+ind1+'' ind2:''+ind2+'' content:''+content);
}
model = utilities.importMARCRecord(record, uri, model);
done();
}, function() {});
+wait();');
+
+
+INSERT INTO "scrapers" VALUES(16, NULL, NULL, 20060603002000, 'SIRSI -2003 Scraper', 'Simon Kornblith', '/uhtbin/cgisirsi',
+'// Detection script for the older SIRSI interface. Returns true when the first
+// text node of some /html/body/form/p paragraph reads "Viewing record", and
+// false otherwise.
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+    if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p/text()[1]'', nsResolver);
+// The loop key is declared with var; a bare "i" would become an implicit global.
+for(var i in elmts) {
+    if(elmts[i].nodeValue == "\n\nViewing record\n") {
+        return true;
+    }
+}
+return false;',
+'// Scraping script for the older SIRSI interface: finds the number of the record
+// being viewed, requests that record in FLAT ASCII format, converts the returned
+// lines into a MARC_Record and imports it into the model.
+var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var namespace = doc.documentElement.namespaceURI;
+var nsResolver = namespace ? function(prefix) {
+    if (prefix == ''x'') return namespace; else return null;
+} : null;
+
+// Returns the first node yielded for xpath relative to contextNode, or null.
+var getNode = function(doc, contextNode, xpath, nsResolver) {
+    return doc.evaluate(xpath, contextNode, nsResolver, XPathResult.ANY_TYPE,null).iterateNext();
+};
+
+// Build the POST target by replacing the trailing numeric path segment of the
+// current URL with 40.
+var uri = doc.location.href;
+var uriRegexp = /^(.*)(\/[0-9]+)$/;
+var m = uriRegexp.exec(uri);
+if(!m) {
+    // Fail with a readable message instead of a TypeError on m[1].
+    throw new Error("SIRSI -2003 Scraper: unexpected record URL " + uri);
+}
+var newUri = m[1]+"/40";
+
+// Look for the paragraph whose first text node is "Viewing record"; the text of
+// its first <b> child is used as the record number.
+var recNumber = null;
+var elmts = utilities.gatherElementsOnXPath(doc, doc, ''/html/body/form/p'', nsResolver);
+for(var i in elmts) {
+    var elmt = elmts[i];
+    var initialText = getNode(doc, elmt, ''./text()[1]'', nsResolver);
+    // getNode yields null for a paragraph without a text node, so check first.
+    if(initialText && initialText.nodeValue == "\n\nViewing record\n") {
+        var numberNode = getNode(doc, elmt, ''./b[1]/text()[1]'', nsResolver);
+        if(numberNode) {
+            recNumber = numberNode.nodeValue;
+        }
+    }
+}
+if(recNumber === null) {
+    // Without a record number the request below would post marks=undefined.
+    throw new Error("SIRSI -2003 Scraper: could not find the record number");
+}
+
+utilities.HTTPUtilities.doPost(newUri, ''marks=''+recNumber+''&shadow=NO&format=FLAT+ASCII&sort=TITLE&vopt_elst=ALL&library=ALL&display_rule=ASCENDING&duedate_code=l&holdcount_code=t&DOWNLOAD_x=22&DOWNLOAD_y=12&address=&form_type='', null, function(text) {
+    // The record is the text between <PRE> and </PRE>.
+    var texts = text.split("<PRE>");
+    if(texts.length < 2) {
+        // No record in the response. done() is still called so that the
+        // wait() below is presumably released - confirm against the framework.
+        done();
+        return;
+    }
+    texts = texts[1].split("</PRE>");
+    text = texts[0];
+    var lines = text.split("\n");
+
+    var record = new MARC_Record();
+
+    var tag, ind1, ind2, content;
+
+    // Adds the field collected so far (if any) to the record, after turning
+    // |x subfield markers into the record subfield delimiter.
+    var flushField = function() {
+        if(tag) {
+            content = content.replace(/\|([a-z])/g, record.subfield_delimiter+"$1");
+            record.add_field(tag, ind1, ind2, content);
+        }
+    };
+
+    for(var i=0; i<lines.length; i++) {
+        var line = lines[i];
+        // A field starts on a line shaped like ".NNN. "; anything else is
+        // treated as a continuation of the current field.
+        var startsField = (line.substr(0, 1) == "." && line.substr(4,2) == ". ");
+
+        if(!startsField) {
+            var extra = line.substring(6);
+            // Ignore lines seen before the first field and empty continuations,
+            // so blank lines do not pad the field with spaces.
+            if(tag && extra) {
+                content += " "+extra;
+            }
+            continue;
+        }
+
+        flushField();
+
+        tag = line.substr(1, 3);
+
+        // Radix 10 is explicit: without it, engines with legacy octal parsing
+        // read tags with a leading zero (for example 082) as octal.
+        // NOTE(review): MARC 21 control fields are 001-009, so tag 010 arguably
+        // belongs in the indicator branch - confirm against SIRSI output.
+        if(parseInt(tag, 10) > 10) {
+            ind1 = line.substr(6, 1);
+            ind2 = line.substr(7, 1);
+            content = line.substr(8);
+        } else {
+            ind1 = "";
+            ind2 = "";
+            content = line.substring(6);
+        }
+    }
+
+    // The loop only stores a field once the next one starts, so the final
+    // field has to be stored here.
+    flushField();
+
+    model = utilities.importMARCRecord(record, uri, model);
+    done();
+});
wait();');
\ No newline at end of file