www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

commit 968348a5d149f0b6f3f8129fa6541e7ed786689d
parent a3df0c39e289fa772e71062b32d6eea370a4898c
Author: Simon Kornblith <simon@simonster.com>
Date:   Tue, 20 Jun 2006 16:08:13 +0000

Add a scraper for Dublin Core metadata embedded in HTML/XHTML META tags



Diffstat:
M scrapers.sql | 38 ++++++++++++++++++++++++++++++++++++--
1 file changed, 36 insertions(+), 2 deletions(-)

diff --git a/scrapers.sql b/scrapers.sql
@@ -1345,4 +1345,38 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) {
 })
 model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
-wait();');
-\ No newline at end of file
+wait();');
+
+REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-20 10:52:00', 'Scraper for Dublin Core expressed as HTML META elements', 'Simon Kornblith', NULL,
+'var metaTags = doc.getElementsByTagName("meta");
+
+if(metaTags) {
+	for(var i=0; i<metaTags.length; i++) {
+		var tag = metaTags[i].getAttribute("name");
+		var value = metaTags[i].getAttribute("content");
+		if(tag && value && tag.substr(0, 3).toLowerCase() == "dc.") {
+			return true;
+		}
+	}
+}
+return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+
+var uri = doc.location.href;
+
+var metaTags = doc.getElementsByTagName("meta");
+
+for(var i=0; i<metaTags.length; i++) {
+	var tag = metaTags[i].getAttribute("name");
+	var value = metaTags[i].getAttribute("content");
+	if(tag && value && tag.substr(0, 3).toLowerCase() == "dc.") {
+		var suffix = tag.substr(3);
+		if(suffix == "creator") {
+			// Everyone uses different methods of encoding the DC creator; clean them
+			value = utilities.cleanAuthor(value);
+		}
+		model.addStatement(uri, prefixDC + suffix, value, true);
+	}
+}');
+\ No newline at end of file