commit 466cf794e1dfdbe758b94101bebc7558fc606cf2
parent 1caa7ac3597997654016b9200d1a94c25107a7c7
Author: Matt Burton <mcburton@gmail.com>
Date: Mon, 29 Mar 2010 15:01:48 +0000
Adding Sylvain's 15mar10 changes.
Diffstat:
1 file changed, 92 insertions(+), 72 deletions(-)
diff --git a/translators/Gallica.js b/translators/Gallica.js
@@ -8,53 +8,53 @@
"maxVersion":"",
"priority":100,
"inRepository":true,
- "lastUpdated":"2009-10-08 17:40:00"
+ "lastUpdated":"2010-02-20 10:40:00"
}
function detectWeb(doc, url) {
- var namespace = doc.documentElement.namespaceURI;
- var nsResolver = namespace ? function(prefix) {
- if (prefix == 'x') return namespace; else return null;
- } : null;
-
- var indexSearch = url.toString().indexOf('http://gallica.bnf.fr/Search');
- var indexArk = url.toString().indexOf('http://gallica.bnf.fr/ark:');
- var indexSNE = url.toString().indexOf('http://gallica.bnf.fr/VisuSNE');
-
- if (indexSearch == 0)
- {
- var errorXpath = '//div[@class="errorMessage"]';
- if (elt = doc.evaluate(errorXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
- // We are on a search page result but it can be an empty result page.
- // Nothing to return;
- }
- else
- {
- return "multiple";
- }
- }
- else if (indexArk == 0)
- {
- var iconxpath = '//div[@id="Infos"]/img';
- if (elt = doc.evaluate(iconxpath, doc, nsResolver,
-XPathResult.ANY_TYPE, null).iterateNext()) {
- var icon = elt.getAttribute('src');
- return getDoctypeGallica(icon);
- }
-
- // For some biblio, the icon picture is located in another div ...
- var iconxpath = '//div[@class="titrePeriodiqueGauche"]/img';
- if (elt = doc.evaluate(iconxpath, doc, nsResolver,
-XPathResult.ANY_TYPE, null).iterateNext()) {
- var icon = elt.getAttribute('src');
-
- return getDoctypeGallica(icon);
- }
- }
- else if (indexSNE == 0)
- {
- return "book";
- }
+ var namespace = doc.documentElement.namespaceURI;
+ var nsResolver = namespace ? function(prefix) {
+ if (prefix == 'x') return namespace; else return null;
+ } : null;
+
+ var indexSearch = url.toString().indexOf('http://gallica.bnf.fr/Search');
+ var indexArk = url.toString().indexOf('http://gallica.bnf.fr/ark:');
+ var indexSNE = url.toString().indexOf('http://gallica.bnf.fr/VisuSNE');
+
+ if (indexSearch == 0)
+ {
+ var errorXpath = '//div[@class="errorMessage"]';
+ if (elt = doc.evaluate(errorXpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext()) {
+ // We are on a search results page, but it may contain no results.
+ // In that case there is nothing to return.
+ }
+ else
+ {
+ return "multiple";
+ }
+ }
+ else if (indexArk == 0)
+ {
+ var iconxpath = '//div[@class="contenu1"]/img';
+ if (elt = doc.evaluate(iconxpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext())
+ {
+ var icon = elt.getAttribute('src');
+ return getDoctypeGallica(icon);
+ }
+
+ // For some bibliographic records, the icon image is located in another div.
+ var iconxpath = '//div[@class="titrePeriodiqueGauche"]/img';
+ if (elt = doc.evaluate(iconxpath, doc, nsResolver,
+ XPathResult.ANY_TYPE, null).iterateNext())
+ {
+ var icon = elt.getAttribute('src');
+ return getDoctypeGallica(icon);
+ }
+ }
+ else if (indexSNE == 0)
+ {
+ return "book";
+ }
}
// This function takes the name of the icon, and returns the Zotero item name
@@ -62,19 +62,29 @@ function getDoctypeGallica(img)
{
var iconname = img.substring(img.lastIndexOf('/') + 1);
- if ( (iconname =='doc_livre_ocr.png') || (iconname == 'doc_livre.png') )
+ if (iconname =='livre_a.png')
{
return "book";
}
- else if (iconname == 'doc_carte.png')
+ else if (iconname == 'carte.png')
{
return "map";
}
- else if (iconname == 'doc_image.png')
+ else if (iconname == 'images.png')
{
return "artwork";
}
- else if ( (iconname == 'doc_periodique.png') || (iconname == 'doc_perio_vol_ocr.png') )
+ else if (iconname == 'docsonore.png')
+ {
+ return "audioRecording";
+ }
+ else if (iconname == 'musiquenotee.png')
+ {
+ // This icon denotes sheet music, but no Zotero item type matches it
+ // as of today (2010-02), so it is reported as "book".
+ return "book";
+ }
+ else if ( (iconname == 'picto_type_document1.png') || (iconname == 'perio_vol_ocr.png') )
{
return "book";
}
@@ -95,52 +105,55 @@ function doWeb(doc, url) {
if (detectWeb(doc, url) == "multiple")
{
var availableItems = new Array();
- var xpath = '//td[@class="ResultatsRechercheInfos"]/a';
+ var xpath = '//div[@class="resultats_line"]';
var elmts = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null);
var elmt = elmts.iterateNext();
var itemsId = new Array();
- var i = 0;
+ var i = 1;
do {
- var id = doc.evaluate('../../..//a[@id]', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
- // This id looks like idN00000. We need the information after id to get the informations about
- // the title. We need to store it in an array, we leave the starting id.
- var cleanId = id.getAttribute('id').substring(2);
- itemsId[i] = cleanId;
-
- var searchTitle = elmt.textContent;
- availableItems[i] = searchTitle;
+ var id = doc.evaluate('div[@class="resultat_id"]', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext().textContent;
+ var this_result = doc.evaluate('div[@class="resultat_desc"]/div[@class="titre"]/a', elmt, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ availableItems[i] = Zotero.Utilities.cleanTags(this_result.getAttribute('title'));
i++;
} while (elmt = elmts.iterateNext());
-
+
var items = Zotero.selectItems(availableItems);
for (var i in items) {
// All informations are available on search result page. We don't need to query
// every subpage with scrape. We'are going to call the special Gallica scrape function
- // This function (scrapeGallica) is reused in scrape.
- var fullpath = '//div[@id="noticeComplete' + itemsId[i] + '"]/div';
- var detail = doc.evaluate(fullpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
- Zotero.debug(itemsId[i]);
- var iconType = doc.evaluate('//a[@id="id' + itemsId[i] + '"]/..//span[@class="typedoc"]/img', doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
- var docType = getDoctypeGallica(iconType.getAttribute('src'));
- Zotero.debug( itemsId[i]);
- scrapeGallica(doc, nsResolver, detail, docType);
+ // This function (scrapeGallica) is reused in scrape.
+ var fullpath = '//div[@class="resultats_line"][' + i + ']';
+
+ var item_element = doc.evaluate(fullpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ if (item_element != undefined)
+ {
+ var detail = doc.evaluate('.//div[@class="notice"]', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+
+ var iconType = doc.evaluate('.//div[@class="picto"]/img', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ var docType = getDoctypeGallica(iconType.getAttribute('src'));
+
+ var docUrl = doc.evaluate('.//div[@class="liens"]/a', item_element, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
+ docUrl = docUrl.getAttribute("href");
+
+ scrapeGallica(doc, nsResolver, detail, docType, docUrl);
+ }
}
}
else
{
var docType = detectWeb(doc, url);
- var xpath = '//div[@id="Popup1"]/div[@class="data"]';
+ var xpath = '//div[@class="notice"]';
var detail = doc.evaluate(xpath, doc, nsResolver, XPathResult.ANY_TYPE, null).iterateNext();
- scrapeGallica(doc, nsResolver, detail, docType);
+ scrapeGallica(doc, nsResolver, detail, docType, "");
}
}
-function scrapeGallica(doc, nsResolver, div, type)
+function scrapeGallica(doc, nsResolver, div, type, direct_url)
{
var item = new Zotero.Item;
item.itemType = type;
@@ -237,10 +250,17 @@ function scrapeGallica(doc, nsResolver, div, type)
}
} while (elmt = elmts.iterateNext());
-
+
if ( (item.url == "") || (item.url == undefined) )
{
- item.url = doc.location.href;
+ if (direct_url != "")
+ {
+ item.url = "http://gallica.bnf.fr" + direct_url;
+ }
+ else
+ {
+ item.url = doc.location.href;
+ }
}
item.complete();
}