commit 968348a5d149f0b6f3f8129fa6541e7ed786689d
parent a3df0c39e289fa772e71062b32d6eea370a4898c
Author: Simon Kornblith <simon@simonster.com>
Date: Tue, 20 Jun 2006 16:08:13 +0000
Add a scraper for Dublin Core metadata embedded in HTML/XHTML META tags
Diffstat:
1 file changed, 36 insertions(+), 2 deletions(-)
diff --git a/scrapers.sql b/scrapers.sql
@@ -1345,4 +1345,38 @@ utilities.HTTPUtilities.doGet(newUri, null, function(text) {
})
model.addStatement(uri, prefixRDF + "type", prefixDummy + "journal", false);
-wait();');
-\ No newline at end of file
+wait();');
+
+REPLACE INTO "scrapers" VALUES('951c027d-74ac-47d4-a107-9c3069ab7b48', '2006-06-20 10:52:00', 'Scraper for Dublin Core expressed as HTML META elements', 'Simon Kornblith', NULL,
+'var metaTags = doc.getElementsByTagName("meta");
+// Detection: claim the page as soon as one META element has a name starting with "dc." (any case) and a non-empty content
+if(metaTags) {
+ for(var i=0; i<metaTags.length; i++) {
+ var tag = metaTags[i].getAttribute("name");
+ var value = metaTags[i].getAttribute("content");
+ if(tag && value && tag.substr(0, 3).toLowerCase() == "dc.") {
+ return true;
+ }
+ }
+}
+return false;', 'var prefixRDF = ''http://www.w3.org/1999/02/22-rdf-syntax-ns#'';
+var prefixDC = ''http://purl.org/dc/elements/1.1/'';
+var prefixDCMI = ''http://purl.org/dc/dcmitype/'';
+var prefixDummy = ''http://chnm.gmu.edu/firefox-scholar/'';
+// Scraping: every statement emitted below is attached to the URL of the current document
+var uri = doc.location.href;
+// Walk all META elements and emit one statement per Dublin Core tag that has a non-empty content
+var metaTags = doc.getElementsByTagName("meta");
+
+for(var i=0; i<metaTags.length; i++) {
+ var tag = metaTags[i].getAttribute("name");
+ var value = metaTags[i].getAttribute("content");
+ if(tag && value && tag.substr(0, 3).toLowerCase() == "dc.") {
+ var suffix = tag.substr(3).toLowerCase(); // the "dc." prefix is matched in any case, so normalize the element name too: "DC.Creator" must map to dc:creator
+ if(suffix == "creator") {
+ // Everyone uses different methods of encoding the DC creator; clean them
+ value = utilities.cleanAuthor(value);
+ }
+ model.addStatement(uri, prefixDC + suffix, value, true); // predicate is the DC 1.1 namespace plus the lower-cased element name
+ }
+}');
+\ No newline at end of file