commit d4a7e5bd375323e290161fa8a6f08b88cd471564
parent 19ff243b92f604d979722fceb67fa65cf055a8cb
Author: Matt Burton <mcburton@gmail.com>
Date: Tue, 7 Apr 2009 15:53:36 +0000
Updating Sudoc.js with Sylvain's changes
Diffstat:
| M | translators/Sudoc.js | | | 362 | +++++++++++++++++++++++++++++++++++++++++++++++-------------------------------- |
1 file changed, 216 insertions(+), 146 deletions(-)
diff --git a/translators/Sudoc.js b/translators/Sudoc.js
@@ -2,7 +2,7 @@
"translatorID":"1b9ed730-69c7-40b0-8a06-517a89a3a278",
"translatorType":4,
"label":"Sudoc",
- "creator":"Sean Takats and Michael Berkowitz",
+ "creator":"Sean Takats and Michael Berkowitz, updated by Sylvain Machefert",
"target":"^http://www\\.sudoc\\.abes\\.fr",
"minVersion":"1.0.0b3.r1",
"maxVersion":"",
@@ -11,42 +11,59 @@
"lastUpdated":"2008-05-19 17:30:00"
}
+
function detectWeb(doc, url) {
var namespace = doc.documentElement.namespaceURI;
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null;
} : null;
- var xpath = '//table/tbody/tr/td[1][@class="preslabel"]/strong';
- var multxpath = '//a[@id="InitialFocusPoint"]';
- var elt;
-
+ var multxpath = '/html/body/div[2]/div/span';
if (elt = doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
- return "multiple";
- }
- else if (elt = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
- {
- var contenu = elt.textContent;
- var numRegexp = /(Num.ro.de.notice|Record.number)/;
- var m = numRegexp.exec(contenu);
- if (m) {
- // On a bien une notice d"ouvrage, on doit chercher limage
- // pour choisir le type de document
- var imgXpath = '/html/body/table/tbody/tr/td[1]/p/img/@src';
- var imgsrc = doc.evaluate(imgXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
- if (imgsrc){
- if (imgsrc.indexOf("icon_per.gif") > 0){
- return "book";
- } else if (imgsrc.indexOf("icon_books.gif") > 0){
- return "book";
- } else if (imgsrc.indexOf("icon_thesis.gif") > 0){
- return "thesis";
- } else if (imgsrc.indexOf("icon_art.gif") > 0){
- return "journalArticle";
- } else {
- return "book";
- }
- }
+ var content = elt.textContent;
+ if ( (content == "Résultats") || (content == "Results") )
+ {
+ return "multiple";
+ }
+ else if ( (content == "Notice complète") || (content == "title data") )
+ {
+ var xpathimage = '/html/body/div[2]/div[4]/span/img';
+ if (elt = doc.evaluate(xpathimage, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
+ {
+ var type = elt.getAttribute('src');
+ if (type.indexOf('article.gif') > 0)
+ {
+ return "journalArticle";
+ }
+ else if (type.indexOf('book.gif') > 0)
+ {
+ return "book";
+ }
+ else if (type.indexOf('handwriting.gif') > 0)
+ {
+ return "manuscript";
+ }
+ else if (type.indexOf('sons.gif') > 0)
+ {
+ return "audioRecording";
+ }
+ else if (type.indexOf('sound.gif') > 0)
+ {
+ return "audioRecording";
+ }
+ else if (type.indexOf('thesis.gif') > 0)
+ {
+ return "thesis";
+ }
+ else if (type.indexOf('map.gif') > 0)
+ {
+ return "map";
+ }
+ else
+ {
+ return "book";
+ }
+ }
}
}
}
@@ -56,105 +73,154 @@ function scrape(doc) {
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null;
} : null;
+
+ var zXpath = '/html/body/span[@class="Z3988"]';
+ var eltCoins = doc.evaluate(zXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ if (eltCoins = doc.evaluate(zXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
+ {
+ var coins = eltCoins.getAttribute('title');
- var rowXpath = '//tr[td[@class="preslabel"]]';
- var tableRows = doc.evaluate(rowXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
- var tableRow;
+ var newItem = new Zotero.Item();
+ newItem.repository = false; // do not save repository
+ if(Zotero.Utilities.parseContextObject(coins, newItem))
+ {
+ if (newItem.title)
+ {
+ // We use the same method as in detectWeb to find
+ // the real type of document
+ var xpathimage = '/html/body/div[2]/div[4]/span/img';
+ if (elt = doc.evaluate(xpathimage, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
+ {
+ var type = elt.getAttribute('src');
+ var ZoteroType = '';
+ if (type.indexOf('article.gif') > 0)
+ {
+ zoteroType = 'journalArticle';
+ }
+ else if (type.indexOf('book.gif') > 0)
+ {
+ zoteroType = 'book';
+ }
+ else if (type.indexOf('handwriting.gif') > 0)
+ {
+ zoteroType = 'manuscript';
+ }
+ else if (type.indexOf('sons.gif') > 0)
+ {
+ zoteroType = "audioRecording";
+ }
+ else if (type.indexOf('sound.gif') > 0)
+ {
+ zoteroType = "audioRecording";
+ }
+ else if (type.indexOf('thesis.gif') > 0)
+ {
+ zoteroType = "thesis";
+ }
+ else if (type.indexOf('map.gif') > 0)
+ {
+ zoteroType = "map";
+ }
+ else
+ {
+ zoteroType = "book";
+ }
+ newItem.itemType = zoteroType;
+ }
+
+ // We need to correct some information where COinS is wrong
+ var rowXpath = '//tr[td[@class="rec_lable"]]';
+ var tableRows = doc.evaluate(rowXpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var tableRow;
+
+ while (tableRow = tableRows.iterateNext())
+ {
+ var field = doc.evaluate('./td[1]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+ var value = doc.evaluate('./td[2]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+ field = Zotero.Utilities.superCleanString(field);
+ field = field.replace(/(\(s\))?\s*:\s*$/, "");
- var newItem = new Zotero.Item();
- // TODO add other item types using detectWeb's icon checking code
- newItem.itemType = "book";
- var imgXpath = '/html/body/table/tbody/tr/td[1]/p/img/@src';
- var imgsrc = doc.evaluate(imgXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
- if (imgsrc){
- if (imgsrc.indexOf("icon_per.gif") > 0){
- newItem.itemType = "book";
- } else if (imgsrc.indexOf("icon_books.gif") > 0){
- newItem.itemType = "book";
- } else if (imgsrc.indexOf("icon_thesis.gif") > 0){
- newItem.itemType = "thesis";
- } else if (imgsrc.indexOf("icon_art.gif") > 0){
- newItem.itemType = "journalArticle";
- } else {
- newItem.itemType = "book";
- }
- } else {
- newItem.itemType = "book";
- }
- while (tableRow = tableRows.iterateNext())
- {
- var field = doc.evaluate('./td[1]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
- var value = doc.evaluate('./td[2]', tableRow, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
- field = Zotero.Utilities.superCleanString(field);
- field = field.replace(/(\(s\))?\s*:\s*$/, "");
- if (field == "Titre" || field == "Title"){
- Zotero.debug("title = " + value);
- value = value.replace(/(\[[^\]]+\])/g,"");
- newItem.title = value.split(" / ")[0];
- }
- if (field.substr(0,6) == "Auteur" || field.substr(0,6) == "Author"){
- var authors = doc.evaluate('./td[2]/a', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
- var author;
- while (author = authors.iterateNext()){
+ // With COinS, only one author is taken, so the creators list is rebuilt from the page.
+ if (field.substr(0,6) == "Auteur" || field.substr(0,6) == "Author")
+ {
+ var authors = doc.evaluate('./td[2]/div', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
+ newItem.creators = new Array();
+ while (author = authors.iterateNext())
+ {
var authorText = author.textContent;
- var authorParts = authorText.split(" (");
- newItem.creators.push(Zotero.Utilities.cleanAuthor(authorParts[0], "author", true));
+
+ authorFunction = authorText.split(". ")[1];
+ authorText = authorText.split(". ")[0];
+ if (authorFunction)
+ {
+ authorFunction = Zotero.Utilities.superCleanString(authorFunction);
+ }
+ var zoteroFunction = '';
+ // TODO: Add other authority types
+ if (authorFunction == 'Traduction')
+ {
+ zoteroFunction = 'Translator';
+ }
+ else
+ {
+ zoteroFunction = 'Author';
+ }
+ newItem.creators.push(Zotero.Utilities.cleanAuthor(authorText, zoteroFunction, true));
+ }
}
- }
- if (field.substr(0,4) == "Date"){
- newItem.date = value;
- }
- if (field.substr(0,7) == "Editeur" || field.substr(0,9) == "Publisher"){
- var pubParts = value.split(" : ");
- newItem.place = pubParts[0];
- // needs error checking below to avoid error
- if (pubParts[1] ) {
- pubParts = pubParts[1].split(", ");
- newItem.publisher = pubParts[0];
+ // The series isn't in COinS
+ else if (field.substr(0,5) == "Serie" || field.substr(0,10) == "Collection")
+ {
+ newItem.series = value;
}
- }
- if (field.substr(0,4) == "ISBN" || field.substr(0,4) == "ISSN"){
- newItem.ISBN = value.split(" (")[0];
- }
- if (field == "Description") {
- var m = value.match(/([0-9]+) (?:[pP])/);
- if (m) {
- newItem.pages = m[1];
+ // When there's a subtitle, only the main title is used!
+ else if (field == "Titre" || field == "Title")
+ {
+ var title = '';
+ var titles = doc.evaluate('./td[2]/div/span', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
+ while (partTitle = titles.iterateNext())
+ {
+ partTitle = partTitle.textContent;
+ partTitle = partTitle.replace(/(\[[^\]]+\] ?)/g,"");
+ title = title + partTitle;
+ }
+ // Remove the author
+ title = title.split(" / ")[0];
+ newItem.title = title;
}
- }
- if (field.substr(0,5) == "Serie" || field.substr(0,10) == "Collection"){
- newItem.series = value;
- }
- if (field.substr(0,6) == "Sujets" || field.substr(0,8) == "Subjects"){
- var subjectElmts = doc.evaluate('./td[2]/a', tableRow, nsResolver, XPathResult.ANY_TYPE, null);
- var subject;
- var subjects;
- while (subject = subjectElmts.iterateNext()){
- subjects = subject.textContent.split(" -- ");
- newItem.tags = newItem.tags.concat(subjects);
+ // Language not defined in COinS
+ else if ( (field == "Langue") || (field == "Language") )
+ {
+ newItem.language = value;
}
+ else if ( (field == "Résumé") || (field == "Abstract") )
+ {
+ if (newItem.abstractNote)
+ {
+ newItem.abstractNote = newItem.abstractNote + " " + value;
+ }
+ else
+ {
+ newItem.abstractNote = value;
+ }
+
+ }
+ else if (field == "Notes")
+ {
+ if (newItem.abstractNote)
+ {
+ newItem.abstractNote = newItem.abstractNote + " " + value;
+ }
+ else
+ {
+ newItem.abstractNote = value;
+ }
+ }
+ }
+ newItem.complete();
}
- if (field == "In" || field == "Dans"){
- var jtitle = value.replace(/(\[[^\]]+\])/g,"");
- jtitle = jtitle.split(" / ")[0];
- jtitle = jtitle.split(" - ")[0];
- newItem.publicationTitle = jtitle;
- //get page numbers
- var m = value.match(/(?:[Pp]\. )([0-9\-]+)/);
- if (m) {
- newItem.pages = m[1];
- }
- //get ISBN or ISSN
- m = value.match(/(?:ISSN|ISBN) ([0-9Xx\-]+)/);
- if (m) {
- newItem.ISBN = m[1];
- newItem.ISSN = m[1];
- }
- // publicationTitle, issue/volume
- }
- // TODO Pages, Notes, Description, Language, Annexes
+ }
}
- newItem.complete();
}
function doWeb(doc, url) {
@@ -162,39 +228,44 @@ function doWeb(doc, url) {
var nsResolver = namespace ? function(prefix) {
if (prefix == 'x') return namespace; else return null;
} : null;
-
- var multxpath = '//a[@id="InitialFocusPoint"]';
- var elt;
-
+
+ var multxpath = '/html/body/div[2]/div/span';
if (elt = doc.evaluate(multxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
- var newUrl = doc.evaluate('//base/@href', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
- var xpath = '//tr/td[3]/a';
- var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
- var elmt = elmts.iterateNext();
- var links = new Array();
- var availableItems = new Array();
- var i = 0;
- do {
+ var content = elt.textContent;
+ if ( (content == "Résultats") || (content == "Results") )
+ {
+ var newUrl = doc.evaluate('//base/@href', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
+ var xpath = '/html/body/div[2]/table/tbody/tr/td[3]/div/a';
+ var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
+ var elmt = elmts.iterateNext();
+ var links = new Array();
+ var availableItems = new Array();
+ var i = 0;
+ do {
var link = doc.evaluate('./@href', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().nodeValue;
var searchTitle = elmt.textContent;
availableItems[i] = searchTitle;
links[i] = link;
i++;
- } while (elmt = elmts.iterateNext());
- var items = Zotero.selectItems(availableItems);
+ } while (elmt = elmts.iterateNext());
+ var items = Zotero.selectItems(availableItems);
- if(!items) {
+ if(!items) {
return true;
+ }
+
+ var uris = new Array();
+ for(var i in items) {
+ uris.push(newUrl + links[i]);
+ Zotero.debug(newUrl + links[i]);
+ }
+ Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
+ function() { Zotero.done(); }, null);
+ Zotero.wait();
}
- var uris = new Array();
- for(var i in items) {
- uris.push(newUrl + links[i]);
+ else if ( (content == "Notice complète") || (content == 'title data') )
+ {
+ scrape(doc);
}
- Zotero.Utilities.processDocuments(uris, function(doc) { scrape(doc) },
- function() { Zotero.done(); }, null);
- Zotero.wait();
}
- else {
- scrape(doc);
- }
-}
-\ No newline at end of file
+}