www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit fd2052e63c289d08a4e018169460c89cace90710
parent 260ce80086759bcf2321b74105a3fd157c5321f9
Author: Simon Kornblith <simon@simonster.com>
Date:   Sat, 24 Jun 2006 17:33:35 +0000

Search results scraping for PubMed and Google Books. This marks the end of what I can do with respect to #15 until I'm at home or CHNM, where I'll have access to the gated collections.


Diffstat:
Mscrapers.sql | 94+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------
1 file changed, 71 insertions(+), 23 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql @@ -1,7 +1,7 @@ --- 15 +-- 16 -- Set the following timestamp to the most recent scraper update date -REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-24 11:22:00')); +REPLACE INTO "version" VALUES ('repository', STRFTIME('%s', '2006-06-24 13:31:00')); REPLACE INTO "scrapers" VALUES('96b9f483-c44d-5784-cdad-ce21b984fe01', '2006-06-22 22:58:00', 'Amazon.com Scraper', 'Simon Kornblith', '^http://www\.amazon\.com/(?:gp/(?:product|search)/|exec/obidos/search-handle-url/)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; @@ -1747,23 +1747,52 @@ for(i in elmts) { model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);'); -REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-18 11:19:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; +REPLACE INTO "scrapers" VALUES('fcf41bed-0cbc-3704-85c7-8062a0068a7a', '2006-06-24 13:17:00', 'PubMed Scraper', 'Simon Kornblith', '^http://www\.ncbi\.nlm\.nih\.gov/entrez/query\.fcgi\?(?:.*db=PubMed.*list_uids=[0-9]|.*list_uids=[0-9].*db=PubMed|.*db=PubMed.*CMD=search|.*CMD=search.*db=PubMed)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; -function mapRDF(text, rdfUri) { +function mapRDF(uri, text, rdfUri) { if(text != "") { model.addStatement(uri, rdfUri, text, true); } } var uri = doc.location.href; -var newUri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=PubMed&retmode=xml&rettype=citation&id="; +var ids = new Array(); var idRegexp = /[\?\&]list_uids=([0-9\,]+)/; + var m = idRegexp.exec(uri); -newUri += m[1]; +if(m) { + ids.push(m[1]); +} else { + var namespace = doc.documentElement.namespaceURI; + var nsResolver = namespace ? function(prefix) { + if (prefix == ''x'') return namespace; else return null; + } : null; + + var items = new Array(); + var tableRows = utilities.gatherElementsOnXPath(doc, doc, ''//div[@class="ResultSet"]/table/tbody'', nsResolver); + // Go through table rows + for(var i=0; i<tableRows.length; i++) { + var link = utilities.getNode(doc, tableRows[i], ''.//a'', nsResolver); + var article = utilities.getNode(doc, tableRows[i], ''./tr[2]/td[2]/text()[1]'', nsResolver); + items[link.href] = article.nodeValue; + } + + items = utilities.selectItems(items); + + if(!items) { + return true; + } + + for(i in items) { + var m = idRegexp.exec(i); + ids.push(m[1]); + } +} +var newUri = "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=PubMed&retmode=xml&rettype=citation&id="+ids.join(","); utilities.HTTPUtilities.doGet(newUri, null, function(text) { // Remove xml parse instruction and doctype text = text.replace(/<!DOCTYPE[^>]*>/, "").replace(/<\?xml[^>]*\?>/, ""); @@ -1773,6 +1802,7 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { for(var i=0; i<xml.PubmedArticle.length(); i++) { var citation = xml.PubmedArticle[i].MedlineCitation; + var uri = "http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=pubmed&cmd=Retrieve&list_uids="+citation.PMID.text(); if(citation.PMID.length()) { model.addStatement(uri, prefixDC + "identifier", "PMID "+citation.PMID.text(), true); } @@ -1793,14 +1823,14 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { } if(article.Journal.Title.length()) { - model.addStatement(uri, prefixDummy + "publication", utilities.superCleanString(article.Journal.Title.text()), true); + model.addStatement(uri, prefixDummy + "publication", utilities.superCleanString(article.Journal.Title.text().toString()), true); } else if(citation.MedlineJournalInfo.MedlineTA.length()) { - model.addStatement(uri, prefixDummy + "publication", utilities.superCleanString(citation.MedlineJournalInfo.MedlineTA.text()), true); + model.addStatement(uri, prefixDummy + "publication", utilities.superCleanString(citation.MedlineJournalInfo.MedlineTA.text().toString()), true); } if(article.Journal.JournalIssue.length()) { - mapRDF(article.Journal.JournalIssue.Volume.text(), prefixDummy + "volume"); - mapRDF(article.Journal.JournalIssue.Issue.text(), prefixDummy + "number"); + mapRDF(uri, article.Journal.JournalIssue.Volume.text(), prefixDummy + "volume"); + mapRDF(uri, article.Journal.JournalIssue.Issue.text(), prefixDummy + "number"); if(article.Journal.JournalIssue.PubDate.length()) { model.addStatement(uri, prefixDC + "date", article.Journal.JournalIssue.PubDate.Day.text()+" "+article.Journal.JournalIssue.PubDate.Month.text()+" "+article.Journal.JournalIssue.PubDate.Year.text(), true); } @@ -1810,22 +1840,22 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) { if(article.AuthorList.length() && article.AuthorList.Author.length()) { var authors = article.AuthorList.Author; for(var j=0; j<authors.length(); j++) { - var lastName = authors[j].LastName.text(); - var firstName = authors[j].FirstName.text(); + var lastName = authors[j].LastName.text().toString(); + var firstName = authors[j].FirstName.text().toString(); if(firstName == "") { - var firstName = authors[j].ForeName.text(); + var firstName = authors[j].ForeName.text().toString(); } if(firstName && lastName) { model.addStatement(uri, prefixDC + "creator", firstName + " " + lastName); } } } + model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); } done(); }) -model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false); wait();'); REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-20 10:52:00', 'Scraper for Dublin Core expressed as HTML META elements', 'Simon Kornblith', NULL, @@ -1862,21 +1892,41 @@ for(var i=0; i<metaTags.length; i++) { } }'); -REPLACE INTO "scrapers" VALUES('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-21 10:28:00', 'Google Books Scraper', 'Simon Kornblith', 'http://books\.google\.com/books\?vid=.*\&id=.*', NULL, +REPLACE INTO "scrapers" VALUES('3e684d82-73a3-9a34-095f-19b112d88bbf', '2006-06-24 13:31:00', 'Google Books Scraper', 'Simon Kornblith', 'http://books\.google\.com/books\?(.*vid=.*\&id=.*|.*q=.*)', NULL, 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#''; var prefixDC = ''http://purl.org/dc/elements/1.1/''; var prefixDCMI = ''http://purl.org/dc/dcmitype/''; var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/''; var uri = doc.location.href; +var newUris = new Array(); + var re = new RegExp(''http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''i''); -var urlParts = re.exec(uri); -var newUri = ''http://books.google.com/books?vid=''+urlParts[1]+''&id=''+urlParts[2]; +var m = re.exec(uri); +if(m) { + newUris.push(''http://books.google.com/books?vid=''+m[1]+''&id=''+m[2]); +} else { + var items = utilities.getItemArray(doc, doc, ''http://books\\.google\\.com/books\\?vid=([^&]+).*\\&id=([^&]+)'', ''^(?:All matching pages|About this Book)''); -utilities.debugPrint(newUri); + // Drop " - Page" thing + for(i in items) { + items[i] = items[i].replace(/- Page [0-9]+\s*$/, ""); + } + items = utilities.selectItems(items); + + if(!items) { + return true; + } + + for(i in items) { + var m = re.exec(i); + newUris.push(''http://books.google.com/books?vid=''+m[1]+''&id=''+m[2]); + } +} -utilities.loadDocument(newUri, browser, function(newBrowser) { - newDoc = newBrowser.contentDocument; +utilities.processDocuments(browser, null, newUris, function(newBrowser) { + var newDoc = newBrowser.contentDocument; + var uri = newDoc.location.href; var namespace = newDoc.documentElement.namespaceURI; var nsResolver = namespace ? function(prefix) { @@ -1923,8 +1973,6 @@ utilities.loadDocument(newUri, browser, function(newBrowser) { } } model.addStatement(uri, prefixRDF + "type", prefixDummy + "book", false); - - done(); -}, function() {}); +}, function() { done(); }, function() {}); wait();'); \ No newline at end of file