www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 7402577806ebec0498d1a1f139e8cc91c30c714b
parent a9c79f61106ba37a9ba2cb890d56a0b875711070
Author: Simon Kornblith <simon@simonster.com>
Date:   Sun, 25 Jun 2006 18:34:23 +0000

Add search results scraping for History Cooperative


Diffstat:
Mscrapers.sql | 82+++++++++++++++++++++++++++++++++++++++++++++----------------------------------
1 file changed, 47 insertions(+), 35 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql @@ -1,7 +1,7 @@ --- 19 +-- 20 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-25 14:16:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-25 14:33:00')); REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -468,18 +468,14 @@ if(doc.title == "JSTOR: Search Results") { function getList(urls, each, done, error) { var url = urls.shift(); - utilities.debugPrint("fetching "+url); utilities.HTTPUtilities.doGet(url, null, function(text) { - utilities.debugPrint("got "+url); if(each) { each(text); } if(urls.length) { - utilities.debugPrint("still more"); getList(urls, each, done, error); } else if(done) { - utilities.debugPrint("done"); done(text); } }, error); @@ -503,7 +499,6 @@ function newDataObject() { utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse?citationAction=removeAll&confirmRemAll=on&viewCitations=1'', null, function() { // clear marked // Mark all our citations getList(saveCitations, null, function() { // mark this - utilities.debugPrint("getting citations"); utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse/citations.txt?exportAction=Save+as+Text+File&exportFormat=cm&viewCitations=1'', null, function(text) { // get marked var k = 0; @@ -517,7 +512,6 @@ utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse?citationAction=remov if(lines[i].substring(0,3) == "<1>") { haveStarted = true; } else if(newItemRe.test(lines[i])) { - utilities.debugPrint("new item!"); if(!stableURL) { if(ISSN) { stableURL = "http://www.jstor.org/browse/"+ISSN; @@ -608,44 +602,65 @@ utilities.HTTPUtilities.doGet(''http://www.jstor.org/browse?citationAction=remov wait();'); -REPLACE INTO "scrapers" VALUES('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-18 11:02:00', 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/journals/.+/.+/.+\.html', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('e85a3134-8c1a-8644-6926-584c8565f23e', '2006-06-25 14:33:00', 'History Cooperative Scraper', 'Simon Kornblith', '^http://www\.historycooperative\.org/(?:journals/.+/.+/.+\.html$|cgi-bin/search.cgi)', NULL, +'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; -var uri = doc.location.href; - -var month, year; - -var metaTags = doc.getElementsByTagName("meta"); - -function associateMeta(field, rdfUri) { +function associateMeta(uri, metaTags, field, rdfUri) { var field = metaTags.namedItem(field); if(field) { model.addStatement(uri, rdfUri, field.getAttribute("content"), false); } } -associateMeta("Title", prefixDC + "title"); -associateMeta("Journal", prefixDummy + "publication"); -associateMeta("Volume", prefixDummy + "volume"); -associateMeta("Issue", prefixDummy + "number"); - -var author = metaTags.namedItem("Author"); -if(author) { - var authors = author.getAttribute("content").split(" and "); - for(j in authors) { - model.addStatement(uri, prefixDC + "creator", authors[j], false); +function scrape(doc) { + var uri = doc.location.href; + var month, year; + var metaTags = doc.getElementsByTagName("meta"); + associateMeta(uri, metaTags, "Title", prefixDC + "title"); + associateMeta(uri, metaTags, "Journal", prefixDummy + "publication"); + associateMeta(uri, metaTags, "Volume", prefixDummy + "volume"); + associateMeta(uri, metaTags, "Issue", prefixDummy + "number"); + + var author = metaTags.namedItem("Author"); + if(author) { + var authors = author.getAttribute("content").split(" and "); + for(j in authors) { + model.addStatement(uri, prefixDC + "creator", authors[j], false); + } + } + + var month = metaTags.namedItem("PublicationMonth"); + var year = metaTags.namedItem("PublicationYear"); + if(month && year) { + model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); } + + model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); } -var month = metaTags.namedItem("PublicationMonth"); -var year = metaTags.namedItem("PublicationYear"); -if(month && year) { - model.addStatement(uri, prefixDC + "date", month.getAttribute("content")+" "+year.getAttribute("content"), false); +if(doc.title == "History Cooperative: Search Results") { + var items = utilities.getItemArray(doc, doc, ''^http://[^/]+/journals/.+/.+/.+\.html$''); + items = utilities.selectItems(items); + + if(!items) { + return true; + } + + var uris = new Array(); + for(i in items) { + uris.push(i); + } + + utilities.processDocuments(browser, null, uris, function(browser) { scrape(browser.contentDocument) }, + function() { done(); }, function() {}); + + wait(); +} else { + scrape(doc); } - -model.addStatement(uri, prefixRDF + "type", prefixDummy + "journalArticle", false); '); REPLACE INTO "scrapers" VALUES('4fd6b89b-2316-2dc4-fd87-61a97dd941e8', '2006-06-23 12:49:00', 'InnoPAC Scraper', 'Simon Kornblith', '^http://[^/]+/(?:search/|record=)', @@ -779,11 +794,8 @@ if(newUri) { utilities.HTTPUtilities.doGet(clearUrl, null, function() { - utilities.debugPrint(clearUrl); utilities.HTTPUtilities.doPost(postUrl, postString, null, function() { - utilities.debugPrint(postUrl + " " + postString); utilities.HTTPUtilities.doPost(exportUrl, "ex_format=50&ex_device=45&SUBMIT=Submit", null, function(text) { - utilities.debugPrint(exportUrl); var records = text.split("\x1D"); for(var i=0; i<(records.length-1); i++) { var record = new MARC_Record();