commit e636bd9c2e8e713ebf594a4859be8bb0b6fae00b
parent c9c5fa79f506719d8be6bb2a9258ed244c7753a6
Author: Avram Lyon <ajlyon@gmail.com>
Date: Thu, 7 Oct 2010 20:36:57 +0000
Trans: Typo fix to BnF, per Ziche; new version of CNKI by Ace Strong
Diffstat:
2 files changed, 687 insertions(+), 150 deletions(-)
diff --git a/translators/Bibliothèque nationale de France.js b/translators/Bibliothèque nationale de France.js
@@ -129,7 +129,7 @@ var BnfClass = function() {
//Default
case "205":
default:
- return "contibutor";
+ return "contributor";
}
};
diff --git a/translators/CNKI.js b/translators/CNKI.js
@@ -2,13 +2,13 @@
"translatorID":"5c95b67b-41c5-4f55-b71a-48d5d7183063",
"label":"CNKI",
"creator":"Ace Strong<acestrong@gmail.com> and Heromyth<zxpmyth@yahoo.com.cn>",
- "target":"^https?://(?:(?:(dlib|epub|ckrd)(?:.edu)?.cnki.net)|(?:[0-9.]+))/(?:kns50|grid2008|grid20)",
+ "target":"^https?://(?:(?:(dlib|epub|acad|apj1|law1)\\.cnki\\.net)|(?:[0-9\\.]+))/(?:grid2008|kns50|Kns55|kcms)",
"minVersion":"2.0.b4",
"maxVersion":"",
"priority":100,
- "inRepository":true,
+ "inRepository":"1",
"translatorType":4,
- "lastUpdated":"2010-09-26 15:08:45"
+ "lastUpdated":"2010-10-07 15:58:33"
}
/*
@@ -29,189 +29,726 @@
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/
+// #######################
+// ##### Sample URLs #####
+// #######################
-function detectWeb(doc, url) {
- var articleRe = /detail.aspx/;
- var s = articleRe.exec(url);
+/*
+ * The starting point for a search is the URL below.
+ * In testing, I tried the following:
+ *
+ * - A search listing of journals
+ * - A search listing of PhD theses
+ * - A search listing of master's theses
+ * - A search listing of conference papers
+ * - A search listing of newspaper articles
+ * - A journal paper page
+ * - A PhD thesis page
+ * - A master's thesis page
+ * - A conference paper page
+ * - A newspaper article page
+ */
+// http://epub.cnki.net/grid2008/index/ZKCALD.htm
- if(s) {
- return "journalArticle";
- } else {
- articleRe = /Brief.aspx/;
- s = articleRe.exec(url);
- if(s)
- return "multiple";
+// #################################
+// #### Local utility functions ####
+// #################################
+
+function detectCode(url) { // Returns the four-letter database code that follows "dbcode=" or "dbname=" in url; falls through (returning undefined) when url has neither.
+ var pattern = /(?:dbcode|dbname)=([A-Z]{4})/i;
+ if (pattern.test(url)) { // NOTE(review): the match is case-insensitive and the code is returned as written in the URL, while detectWeb and scrapeAndParse2 compare it against upper-case literals
+ var code = pattern.exec(url)[1];
+ return code;
}
+}
- return false;
+function getResolver(doc) { // Builds the namespace resolver passed to evaluate(): a function when the document element has a namespace URI, otherwise null.
+ var namespace, resolver;
+ namespace = doc.documentElement.namespaceURI;
+ if (namespace) {
+ resolver = function(prefix) { // only the prefix x is mapped, and it maps to the document's own namespace
+ if (prefix == 'x') {
+ return namespace;
+ } else {
+ return null;
+ }
+ };
+ } else {
+ resolver = null;
+ }
+ return resolver;
+}
+
+function trimTags(text) { // Returns text with every <...> span removed.
+ var pattern = /(<.*?>)/g;
+ text = text.replace(pattern, ""); // the dot does not match line breaks, so a tag spanning several lines is left in place
+ return text;
+}
+
+function trimMultiline(text) { // Returns text with each run of two or more whitespace characters replaced by a single newline.
+ var pattern = /(\s{2,})/g;
+ text = text.replace(pattern, "\n");
+ return text;
}
-function scrape(doc, url) {
- //var namespace = doc.documentElement.namespaceURI;
- //var nsResolver = namespace ? function(prefix) {
- // if (prefix == "x") return namespace; else return null;
- //} : null;
- var nsResolver = null;
+// #############################
+// ##### Scraper functions #####
+// #############################
+// Journal articles: fetch the page source at url, regex-scrape its fields into a new journalArticle item, and complete it.
+function scrapeAndParse1(url) {
+// Zotero.debug("journalArticle");
+ var page = Zotero.Utilities.retrieveSource(url); // page source as a string; presumably fetched synchronously - TODO confirm against the Zotero API
+ var pattern;
+
+ // Item type & URL
var itemType = "journalArticle";
- // TODO: 因为中英文信息都不想丢失,所以存为两个Item,也算是中国特色吧~
- // 但是目前只解析出中文的信息,下个版本中添加英文信息。
var newItem = new Zotero.Item(itemType);
- //Zotero.debug(itemType);
+// Zotero.debug(url);
newItem.url = url;
// Title
- var titles = doc.title.split('-').slice(0,-1);
- //Zotero.debug(titles);
- var title = titles.join("-");
- Zotero.debug("Title:"+title);
- newItem.title = title;
-
- // 附件,网页快照
- var snapName = title + " (CNKI)";
- //Zotero.debug(snapName);
- //newItem.attachments.push({document:doc, title:snapName});
- newItem.attachments.push({url:newItem.url, snapshot:true, title:snapName, mimeType:"text/html"});
- //Zotero.debug(doc);
-
- // 其他信息,/html/body/table[4]/tbody/tr/td[2]/table/tbody/tr/td[2]/table[2]/tbody
- var dataRows = doc.evaluate('//body/table[4]/tbody/tr/td[2]/table/tbody/tr/td[2]/table[2]/tbody/tr', doc, nsResolver,
- XPathResult.ANY_TYPE, null);
- var dataRow;
- while(dataRow = dataRows.iterateNext()) {
- var tds = dataRow.getElementsByTagName("td");
- var heading = Zotero.Utilities.trimInternal(tds[0].textContent);
- var content = tds[1];
- if(heading == "【作者中文名】" || heading == "【作者】") {
- //Zotero.debug("Authors:");
- var as = content.getElementsByTagName("a");
- //Zotero.debug(as.length);
- var i=0;
- for(i=0;i<as.length;i++) {
- var a = as[i];
- newItem.creators.push(Zotero.Utilities.cleanAuthor(a.textContent, "author", true));
- //Zotero.debug(a.textContent);
+ pattern = /<span (?:id="chTitle"|class='datatitle')>(.*?)<\/span>/;
+ if (pattern.test(page)) {
+ var title = trimTags(pattern.exec(page)[1]);
+ newItem.title = title;
+// Zotero.debug("title: "+title);
+ }
+
+ // Authors
+ var authorNames;
+ pattern = /【作者】(?:[\s\S]*?)GetLinkListEx\('(.*?);','/;
+ if (pattern.test(page)) {
+ authorNames = pattern.exec(page)[1].split(";");
+ } else { // fallback: take everything between the label and the next closing table row
+ pattern = /【作者】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ authorNames = trimTags(pattern.exec(page)[1]).split(";");
+ }
+ }
+ if (authorNames) {
+ for (var i=0; i<authorNames.length; i++) {
+ var authorName = Zotero.Utilities.trim(authorNames[i]);
+ if (authorName.length > 0) {
+ newItem.creators.push(
+ Zotero.Utilities.cleanAuthor(authorNames[i], // NOTE(review): passes the untrimmed authorNames[i], not the trimmed authorName computed above
+ "author", true));
+ }
+ }
+// Zotero.debug("authorNames:\n"+authorNames);
+ }
+
+ // Abstract
+ var abst;
+ pattern = /【摘要】\s*<[^>]*>(.*?)<\/span>/;
+ if (pattern.test(page)) {
+ abst = trimTags(pattern.exec(page)[1]);
+ } else {
+ pattern = /【摘要】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ abst = trimTags(pattern.exec(page)[1]);
+ }
+ }
+ if (abst) {
+// Zotero.debug("abstract:\n"+abst);
+ newItem.abstractNote = Zotero.Utilities.trim(abst);
+ }
+ pattern = /【Abstract】\s*<[^>]*>(.*?)<\/span>/;
+ if (pattern.test(page)) {
+ abst = trimTags(pattern.exec(page)[1]);
+ } else {
+ pattern = /【英文摘要】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ abst = trimTags(pattern.exec(page)[1]);
+ }
+ }
+ if (abst) { // NOTE(review): abst is not reset before the second lookup, so when neither English pattern matches, the first abstract is appended to itself here
+// Zotero.debug("abstract:\n"+abst);
+ if (newItem.abstractNote===undefined) {
+ newItem.abstractNote = Zotero.Utilities.trim(abst);
+ } else {
+ newItem.abstractNote = newItem.abstractNote + "\n"
+ + Zotero.Utilities.trim(abst);
+ }
+ }
+// Zotero.debug(newItem.abstractNote);
+
+ // Keywords
+ var tags;
+ pattern = /【关键词】(?:[\s\S]*?)KeywordFilter\('(.*?)'\),'kw'/;
+ if (pattern.test(page)) {
+ tags = pattern.exec(page)[1].split(";");
+ } else {
+ pattern = /【中文关键词】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ tags = trimTags(pattern.exec(page)[1]).split(";");
+ }
+ }
+ if (tags) {
+ for (var i=0; i<tags.length; i++) {
+ var tag = Zotero.Utilities.trim(tags[i]);
+ if (tag.length>0 && newItem.tags.indexOf(tag)<0) { // skip empty entries and tags already on the item
+ newItem.tags.push(tag);
+ }
+ }
+// Zotero.debug("tags:\n"+tags);
+ }
+ pattern = /【Key words】(?:[\s\S]*?)GetLinkList\('(.*?)','kw'/;
+ if (pattern.test(page)) {
+ tags = pattern.exec(page)[1].split(";");
+ } else {
+ pattern = /【英文关键词】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ tags = trimTags(pattern.exec(page)[1]).split(";");
+ }
+ }
+ if (tags) {
+ for (var i=0; i<tags.length; i++) {
+ var tag = Zotero.Utilities.trim(tags[i]);
+ if (tag.length>0 && newItem.tags.indexOf(tag)<0) {
+ newItem.tags.push(tag);
}
- } else if(heading == "【文献出处】" || heading == "【刊名】") {
- //Zotero.debug("Publication:");
- var as = content.getElementsByTagName("a");
- //Zotero.debug(as[0].textContent);
- //Zotero.debug(as[3].textContent);
- // 出版社
- newItem.publicationTitle = as[0].textContent;
- var parts = Zotero.Utilities.trimInternal(as[3].textContent);
- // 出版时间
- var year = parts.substr(0,4);
- //Zotero.debug(year);
- newItem.date = year;
- // 卷号或期号
- var pattern = /(.*)(期|卷)/
- var testStr = parts.split(" ")[1];
- //Zotero.debug(testStr);
- if (pattern.test(testStr)){
- var attr = pattern.exec(testStr);
- //Zotero.debug(attr[1]+":"+attr[2]);
- if(attr[2]=="期"){
- newItem.issue = attr[1];
- }else{
- newItem.volume = attr[1];
- }
+ }
+// Zotero.debug("tags:\n"+tags);
+ }
+
+ // Source publication & DOI & publication date
+ pattern = /【文献出处】([\s\S]*?)<\/a>/;
+ if (pattern.test(page)) {
+ var publicationTitle = trimTags(pattern.exec(page)[1]);
+ newItem.publicationTitle = Zotero.Utilities.trim(publicationTitle);
+// Zotero.debug("publicationTitle: "+publicationTitle);
+ }
+ var doi;
+ pattern = /【DOI】(.*?)<\/li>/;
+ if (pattern.test(page)) {
+ doi= pattern.exec(page)[1];
+ } else {
+ pattern = /【DOI】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ doi= trimTags(pattern.exec(page)[1]);
+ }
+ }
+ if (doi) {
+ newItem.DOI = Zotero.Utilities.trim(doi);
+// Zotero.debug("doi: "+doi);
+ }
+ pattern = /【文献出处】(?:[\s\S]*?)(\d{4})年\s*(\d{2})(卷|期)/;
+ if (pattern.test(page)) { // captures: a 4-digit year, a 2-digit number, and the unit character that follows it
+ var date = pattern.exec(page)[1];
+ newItem.date = date;
+ var val = pattern.exec(page)[2];
+ var attr = pattern.exec(page)[3];
+ if (attr == "卷") { // this unit stores the number as volume; the only other alternative in the pattern stores it as issue
+ newItem.volume = val;
+ } else {
+ newItem.issue = val;
+ }
+// Zotero.debug("date: "+date);
+// Zotero.debug("val: "+val);
+// Zotero.debug("attr: "+attr);
+ }
+
+ newItem.complete();
+}
+
+// Theses: fetch the page source at url, regex-scrape its fields into a new thesis item, and complete it.
+function scrapeAndParse2(url) {
+// Zotero.debug("thesis");
+ var page = Zotero.Utilities.retrieveSource(url);
+ var pattern;
+
+ // Item type & URL
+ var itemType = "thesis";
+ var newItem = new Zotero.Item(itemType);
+// Zotero.debug(url);
+ newItem.url = url;
+ var code = detectCode(url);
+ if (code == "CDFD") { // CDFD is labelled as a doctoral thesis; every other code, including a missing one, is labelled as a master's thesis
+ newItem.thesisType = "博士论文"
+ } else {
+ newItem.thesisType = "硕士论文"
+ }
+// Zotero.debug(newItem.thesisType);
+
+
+ // Title
+ pattern = /<span (?:id="chTitle"|class='datatitle')>(.*?)<\/span>/;
+ if (pattern.test(page)) {
+ var title = pattern.exec(page)[1];
+ pattern = /(<.*?>)/g;
+ title = title.replace(pattern, ""); // same tag stripping that trimTags performs
+ newItem.title = title;
+// Zotero.debug("title: "+title);
+ }
+
+ // Authors
+ pattern = /【作者】([\s\S]*?)<\/a>/;
+ if (pattern.test(page)) {
+ var authorNames = trimTags(pattern.exec(page)[1]).split(";");
+ for (var i=0; i<authorNames.length; i++) {
+ newItem.creators.push(
+ Zotero.Utilities.cleanAuthor(authorNames[i],
+ "author", true));
+ }
+// Zotero.debug("authorNames:\n"+authorNames);
+ }
+
+ // Advisors (stored with the director creator type)
+ pattern = /【导师】([\s\S]*?)<\/a>/;
+ if (pattern.test(page)) {
+ var directors = trimTags(pattern.exec(page)[1]).split(";");
+ for (var i=0; i<directors.length; i++) {
+ newItem.creators.push(
+ Zotero.Utilities.cleanAuthor(trimTags(directors[i]),
+ "director", true));
+ }
+// Zotero.debug("directors: "+directors);
+ }
+
+ // Abstract
+ var abst;
+ pattern = /ReplaceFont\('ChDivSummary','(.*?)(?='\);ReplaceFont)/;
+ if (pattern.test(page)) {
+ abst = trimTags(pattern.exec(page)[1]);
+ } else {
+ pattern = /【中文摘要】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ abst = trimTags(pattern.exec(page)[1]);
+ }
+ }
+ if (abst) {
+// Zotero.debug("abstract:\n"+abst);
+ newItem.abstractNote = trimMultiline(abst);
+ }
+ pattern = /ReplaceFont\('EnDivSummary','(.*?)(?='\);if)/;
+ if (pattern.test(page)) {
+ abst = trimTags(pattern.exec(page)[1]);
+ } else {
+ pattern = /【英文摘要】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ abst = trimTags(pattern.exec(page)[1]);
+ }
+ }
+ if (abst) { // NOTE(review): abst is not reset before the second lookup, so when neither English pattern matches, the first abstract is appended to itself here
+// Zotero.debug("abstract:\n"+abst);
+ if (newItem.abstractNote===undefined) {
+ newItem.abstractNote = Zotero.Utilities.trim(abst);
+ } else {
+ newItem.abstractNote = newItem.abstractNote + "\n"
+ + trimMultiline(abst);
+ }
+ }
+// Zotero.debug(newItem.abstractNote);
+
+ // Keywords
+ var tags;
+ pattern = /【关键词】\s*<span[^>]*>(.*?)<\/a>*<\/span>/;
+ if (pattern.test(page)) {
+ tags = trimTags(pattern.exec(page)[1]).split(";");
+ } else {
+ pattern = /【关键词】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ tags = trimTags(pattern.exec(page)[1]).split(";");
+ }
+ }
+ if (tags) {
+ for (var i=0; i<tags.length; i++) {
+ var tag = Zotero.Utilities.trim(tags[i]);
+ if (tag.length>0 && newItem.tags.indexOf(tag)<0) {
+ newItem.tags.push(tag);
}
- } else if(heading == "【摘要】" || heading == "【英文摘要】") {
- //Zotero.debug("Abstract:");
- var abstract = null;
- if (content.getElementsByTagName("font")[0] != null){
- abstract = content.getElementsByTagName("font")[0].textContent;
+ }
+// Zotero.debug("tags:\n"+tags);
+ }
+ pattern = /【Key words】\s*<span[^>]*>(.*?)<\/a>*<\/span>/;
+ if (pattern.test(page)) {
+ tags = trimTags(pattern.exec(page)[1]).split(";");
+ } else {
+ pattern = /【英文关键词】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ tags = trimTags(pattern.exec(page)[1]).split(";");
+ }
+ }
+ if (tags) {
+ for (var i=0; i<tags.length; i++) {
+ var tag = Zotero.Utilities.trim(tags[i]);
+ if (tag.length>0 && newItem.tags.indexOf(tag)<0) {
+ newItem.tags.push(tag);
}
- else{
- // 有些地方没有字体,直接在td标签下就是摘要。
- abstract = content.textContent;
+ }
+// Zotero.debug("tags:\n"+tags);
+ }
+// Zotero.debug(newItem.tags);
+
+ // Publishing school & DOI & publication date
+ var publisher;
+ pattern = /【网络出版投稿人】\s*<a[^>]*>(.*?)<\/a>/;
+ if (pattern.test(page)) {
+ publisher = pattern.exec(page)[1];
+ } else {
+ pattern = /【网络出版投稿人】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ publisher = Zotero.Utilities.trim(
+ trimTags(pattern.exec(page)[1]));
+ }
+ }
+ if (publisher) {
+ pattern = /(.*?)((.*?))/;
+ if (pattern.test(publisher)) { // NOTE(review): as written the pattern always matches with empty captures, so publisher and place both become empty strings; presumably literal brackets around the place name were lost - confirm against the upstream file
+ newItem.publisher = pattern.exec(publisher)[1];
+ newItem.place = pattern.exec(publisher)[2];
+ } else {
+ newItem.publisher = publisher;
+ }
+// Zotero.debug("publisher: "+publisher);
+ }
+ var doi;
+ pattern = /【DOI】(.*?)<\/li>/;
+ if (pattern.test(page)) {
+ doi= pattern.exec(page)[1];
+ } else {
+ pattern = /【DOI】([\s\S]*?)<\/tr>/;
+ if (pattern.test(page)) {
+ var doi= trimTags(pattern.exec(page)[1]);
+ }
+ }
+ if (doi) {
+ newItem.DOI = Zotero.Utilities.trim(doi);
+// Zotero.debug("doi: "+doi);
+ }
+ var date;
+ pattern = /【网络出版投稿时间】(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ date = pattern.exec(page)[1];
+ } else {
+ pattern = /【网络出版投稿时间】([\s\S]*?)\s*<\/tr>/;
+ if (pattern.test(page)) {
+ date = trimTags(pattern.exec(page)[1]);
+ }
+ }
+ if (date) {
+ newItem.date = Zotero.Utilities.trim(date);
+// Zotero.debug("date: "+date);
+ }
+
+ newItem.complete();
+}
+
+// Conference papers: fetch the page source at url, regex-scrape its fields into a new conferencePaper item, and complete it.
+function scrapeAndParse3(url) {
+// Zotero.debug("conferencePaper");
+ var page = Zotero.Utilities.retrieveSource(url);
+ var pattern;
+
+ // Item type & URL
+ var itemType = "conferencePaper";
+ var newItem = new Zotero.Item(itemType);
+// Zotero.debug(url);
+ newItem.url = url;
+
+ // Title
+ pattern = /<span id="chTitle">(.*?)<\/span>/;
+ if (pattern.test(page)) {
+ var title = trimTags(pattern.exec(page)[1]);
+ newItem.title = title;
+// Zotero.debug("title: "+title);
+ }
+
+ // Authors
+ pattern = /【作者】(.*?)<\/p>/;
+ if (pattern.test(page)) {
+ var authorNames = trimTags(pattern.exec(page)[1]).split(";");
+ for (var i=0; i<authorNames.length; i++) {
+ newItem.creators.push(
+ Zotero.Utilities.cleanAuthor(
+ Zotero.Utilities.trim(authorNames[i]),
+ "author", true));
+ }
+// Zotero.debug("authorNames:\n"+authorNames);
+ }
+
+ // Abstract
+ var abst;
+ pattern = /ReplaceFont\('ChDivSummary','(.*?)(?='\);ReplaceFont)/;
+ if (pattern.test(page)) {
+ abst = pattern.exec(page)[1];
+// Zotero.debug("raw:\n"+abst);
+ pattern = /(<.*?>)/g;
+ abst = abst.replace(pattern, ""); // same tag stripping that trimTags performs
+// Zotero.debug("after:\n"+abst);
+ newItem.abstractNote = Zotero.Utilities.trim(abst);
+ }
+
+ pattern = /ReplaceFont\('EnDivSummary','(.*?)(?='\);if)/;
+ if (pattern.test(page)) {
+ abst = pattern.exec(page)[1];
+// Zotero.debug("raw:\n"+abst);
+ if (abst != undefined && abst != null) {
+ pattern = /(<.*?>)/g;
+ abst = abst.replace(pattern, "");
+// Zotero.debug("after:\n"+abst);
+
+ if (newItem.abstractNote===undefined) { // the second abstract is appended on a new line when a first one was already stored
+ newItem.abstractNote = Zotero.Utilities.trim(abst);
+ } else {
+ newItem.abstractNote = newItem.abstractNote + "\n"
+ + Zotero.Utilities.trim(abst);
}
- //Zotero.debug(abstract);
- //Zotero.debug(newItem.abstractNote);
- if(newItem.abstractNote===undefined){
- newItem.abstractNote = Zotero.Utilities.trim(abstract);
- }else{
- newItem.abstractNote = newItem.abstractNote + "\n" + Zotero.Utilities.trim(abstract);
+ }
+ }
+// Zotero.debug("abst:\n"+newItem.abstractNote);
+
+ // Keywords
+ pattern = /【关键词】\s*<span[^>]*>(.*?)<\/a>*<\/span>/;
+ if (pattern.test(page)) {
+ var tags = trimTags(pattern.exec(page)[1]).split(";");
+ for (var i=0; i<tags.length; i++) {
+ var tag = Zotero.Utilities.trim(tags[i]);
+ if (tag.length>0 && newItem.tags.indexOf(tag)<0) { // skip empty entries and tags already on the item
+ newItem.tags.push(tag);
}
- } else if(heading == "【DOI】") {
- //Zotero.debug("DOI:");
- var doi = Zotero.Utilities.trimInternal(content.textContent);
- //Zotero.debug(doi);
- newItem.DOI = doi;
- } else if(heading == "【关键词】"||heading == "【英文关键词】"||heading == "【中文关键词】") {
- //Zotero.debug("tags:");
- var as = content.getElementsByTagName("a");
- var i=0;
- for(i=0;i<as.length;i++) {
- var a = as[i];
- //Zotero.debug(a.textContent);
- newItem.tags.push(a.textContent);
+ }
+// Zotero.debug("tags:\n"+tags);
+ }
+ pattern = /【Key words】\s*<span[^>]*>(.*?)<\/a>*<\/span>/;
+ if (pattern.test(page)) {
+ var tags = trimTags(pattern.exec(page)[1]).split(";");
+ for (var i=0; i<tags.length; i++) {
+ var tag = Zotero.Utilities.trim(tags[i]);
+ if (tag.length>0 && newItem.tags.indexOf(tag)<0) {
+ newItem.tags.push(tag);
}
}
+// Zotero.debug("tags:\n"+tags);
+ }
+// Zotero.debug(newItem.tags);
+
+ // Conference name & proceedings title & conference place & conference date
+ pattern = /【会议名称】(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ var conferenceName = trimTags(pattern.exec(page)[1]);
+ newItem.conferenceName = conferenceName;
+// Zotero.debug("conferenceName: "+conferenceName);
+ }
+ pattern = /【会议录名称】(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ var proceedingsTitle = trimTags(pattern.exec(page)[1]);
+ newItem.proceedingsTitle = proceedingsTitle;
+// Zotero.debug("proceedingsTitle: "+proceedingsTitle);
+ }
+ pattern = /【会议地点】(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ var place = trimTags(pattern.exec(page)[1]);
+ newItem.place = place;
+// Zotero.debug("place: "+place);
+ }
+ pattern = /【会议时间】(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ var date = trimTags(pattern.exec(page)[1]);
+ newItem.date = date;
+// Zotero.debug("date: "+date);
+ }
+
+ newItem.complete();
+}
+
+// Newspaper articles: fetch the page source at url, regex-scrape its fields into a new newspaperArticle item, and complete it.
+function scrapeAndParse4(url) {
+// Zotero.debug("newspaperArticle");
+ var page = Zotero.Utilities.retrieveSource(url);
+ var pattern;
+
+ // Item type & URL
+ var itemType = "newspaperArticle";
+ var newItem = new Zotero.Item(itemType);
+// Zotero.debug(url);
+ newItem.url = url;
+
+ // Title
+ pattern = /<span id="chTitle">(.*?)<\/span>/;
+ if (pattern.test(page)) {
+ var title = trimTags(pattern.exec(page)[1]);
+ newItem.title = title;
+// Zotero.debug("title: "+title);
+ }
+
+ // Subtitle / lead-in headline (saved as shortTitle)
+ var shortTitle;
+ pattern = /<p>【(?:副标题|引题)】(.*?)(?=<\/p>)/;
+ if (pattern.test(page)) {
+ shortTitle = pattern.exec(page)[1];
+// Zotero.debug("shortTitle: "+shortTitle);
+ newItem.shortTitle = Zotero.Utilities.trimInternal(shortTitle);
+ }
+// Zotero.debug(newItem.shortTitle);
+
+ // Authors
+ pattern = /【作\s*者】(.*?)<\/p>/;
+ if (pattern.test(page)) {
+ var authorNames = trimTags(pattern.exec(page)[1]).split(";");
+ for (var i=0; i<authorNames.length; i++) {
+ newItem.creators.push(
+ Zotero.Utilities.cleanAuthor(
+ Zotero.Utilities.trim(authorNames[i]),
+ "author", true));
+ }
+// Zotero.debug("authorNames:\n"+authorNames);
+ }
+
+ // Body-text excerpt (saved as abstractNote)
+ var abst;
+ pattern = /<p>【正文快照】(.*?)(?=<\/p>)/;
+ if (pattern.test(page)) {
+ abst = pattern.exec(page)[1];
+// Zotero.debug("abst:\n"+abst);
+ newItem.abstractNote = Zotero.Utilities.trimInternal(abst);
+ }
+// Zotero.debug(newItem.abstractNote);
+
+ // Newspaper name & DOI & publication date & section name & edition number
+ pattern = /【报纸名称】\s*<[^>]*>(.*?)<\/a>/;
+ if (pattern.test(page)) {
+ var publicationTitle = trimTags(pattern.exec(page)[1]);
+ newItem.publicationTitle = publicationTitle;
+// Zotero.debug("publicationTitle: "+publicationTitle);
+ }
+ pattern = /【DOI】\s*(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ var doi = pattern.exec(page)[1];
+ newItem.DOI = doi;
+// Zotero.debug("doi: "+doi);
+ }
+ pattern = /【报纸日期】\s*(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ var date = pattern.exec(page)[1];
+ newItem.date = date;
+// Zotero.debug("date: "+date);
}
- // download pdf file
- // /html/body/table[4]/tbody/tr/td[2]/table/tbody/tr/td[2]/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td/table/tbody/tr/td[2]/a[2]
- //var as = table3.getElementsByTagName("a");
- //Zotero.debug(as[0].textContent);
- //Zotero.debug(as[1].textContent);
- //var pdfurlElmt = as[1];
- //if (pdfurlElmt) {
- // newItem.attachments.push({url:pdfurlElmt.href, title:"CNKI Full Text PDF", mimeType:"application/pdf", downloadable:true});
- //}
- //Zotero.debug(pdfurlElmt.href);
- //Zotero.debug("finished.");
+ pattern = /【版名】\s*(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ var section = pattern.exec(page)[1];
+ newItem.section = section;
+// Zotero.debug("section: "+section);
+ }
+ pattern = /【版号】\s*(.*?)\s*<\/li>/;
+ if (pattern.test(page)) {
+ var edition = pattern.exec(page)[1];
+ newItem.edition = edition;
+// Zotero.debug("edition: "+edition);
+ }
+
newItem.complete();
}
+// #########################
+// ##### API functions #####
+// #########################
+
+function detectWeb(doc, url) { // Maps url to an item type string, to "multiple" for listing pages, or to false; doc is never read.
+ var pattern = /detail.aspx/;
+
+ if (pattern.test(url)) { // detail page: the item type is chosen from the database code found in the URL
+ var code = detectCode(url);
+// Zotero.debug(code);
+ if (code == "CJFQ" || code == "CJFD") {
+ return "journalArticle";
+ } else if (code == "CDFD") {
+ return "thesis";
+ } else if (code == "CMFD" || code == "CLKM") { // same result as the CDFD branch; scrapeAndParse2 separates CDFD from the rest by code
+ return "thesis";
+ } else if (code == "CPFD") {
+ return "conferencePaper";
+ } else if (code == "CCND") {
+ return "newspaperArticle";
+ }
+ }
+
+ pattern = /brief/;
+ if (pattern.test(url)) { // any URL containing lower-case brief counts as a result listing, including a detail URL whose code was not recognised above
+ return "multiple"
+ }
+
+ return false;
+}
+
function doWeb(doc, url) { // Entry point: gathers the detail-page URL(s) for the current page and hands each one to the scraper that matches its detected type.
- var namespace = doc.documentElement.namespaceURI;
- var nsResolver = null;
+ var nsResolver = getResolver(doc);
+ var urls, tds;
Zotero.debug(url);
- if(detectWeb(doc, url) == "multiple") {
- //Zotero.debug("Enter multiple~");
+ if (detectWeb(doc, url) == "multiple") {
+// Zotero.debug("Enter multiple.");
// search page
var items = new Array();
- var tableRows = doc.evaluate('//table[4]/tbody/tr/td[4]/table[3]/tbody/tr/td/table/tbody/tr[2]/td/table/tbody/tr', doc, nsResolver, XPathResult.ANY_TYPE, null);
- //Zotero.debug("get table rows");
- var tableRow;
- //Zotero.debug("begin to fetch multiple title and link");
- while(tableRow = tableRows.iterateNext()) {
- //Zotero.debug(tableRow!=null);
- var title = "";
- var link = "";
- var as = tableRow.getElementsByTagName("a");
- for each(var a in as) {
- if(a.textContent) {
- // shoulde only one 'a' here.
- link = a.href;
- title = a.textContent;
- }
+ var xpath = '//iframe[@id="iframeResult"]';
+ var iframe = doc.evaluate(xpath, doc, nsResolver,
+ XPathResult.ANY_TYPE, null).iterateNext();
+ xpath = '//div[@class="GridTitleDiv"]';
+ if (iframe) { // when the iframeResult frame exists the title cells are looked up in its document, otherwise in the page itself
+ var subdoc = iframe.contentDocument;
+ tds = subdoc.evaluate(xpath, subdoc, nsResolver,
+ XPathResult.ANY_TYPE, null);
+ } else {
+ tds = doc.evaluate(xpath, doc, nsResolver,
+ XPathResult.ANY_TYPE, null);
+ }
+
+ var td = tds.iterateNext();
+ var link;
+ var title;
+ while (td) {
+ var a = td.getElementsByTagName("a")[0]; // first link inside the title cell; NOTE(review): undefined when the cell has no link, which would make the next line throw
+ title = Zotero.Utilities.cleanTags(a.textContent);
+ pattern = /;(.*)/;
+ if (pattern.test(title)) { // keep only the text after the first semicolon; NOTE(review): pattern is assigned without var in this function, so it becomes a global
+ title = pattern.exec(title)[1];
}
- //Zotero.debug(title);
- //Zotero.debug(link);
- if(link) {
+ link = a.getAttribute("href");
+ if (link) {
+ pattern = /^(http:\/\/.*?)\//;
+ link = pattern.exec(url)[1] + link; // prefix the href with the scheme and host of url; NOTE(review): the pattern only accepts the http scheme, so for an https url exec() returns null and this line throws
items[link] = Zotero.Utilities.trimInternal(title);
+// Zotero.debug("title:"+title);
+// Zotero.debug("link:"+link);
}
+ td = tds.iterateNext();
}
- // 让用户选择要保存哪些文献
- items = Zotero.selectItems(items);
- if(!items) return true;
- //Zotero.debug("go on processing.");
+// Zotero.debug(items);
+ if (items.__count__) { // NOTE(review): __count__ is a non-standard property - confirm the host engine still provides it, otherwise no item is ever offered
+ // Let the user choose which items to save
+ items = Zotero.selectItems(items);
+ if (!items) return true;
- var urls = new Array();
- for(var url in items) {
- urls.push(url);
+ urls = new Array();
+ for (var url in items) {
+ urls.push(url);
+ }
}
} else {
- var urls = [url];
+ urls = [url];
+ }
+
+ if (urls) { // urls stays undefined when the listing produced no items
+// Zotero.debug(urls);
+
+ for (var i=0; i<urls.length; i++) {
+ var type = detectWeb(null, urls[i]); // detectWeb only inspects the URL, so no document is passed
+// Zotero.debug(type);
+ if (type == "journalArticle") {
+ scrapeAndParse1(urls[i]);
+ } else if (type == "thesis") {
+ scrapeAndParse2(urls[i]);
+ } else if (type == "conferencePaper") {
+ scrapeAndParse3(urls[i]);
+ } else if (type == "newspaperArticle") {
+ scrapeAndParse4(urls[i]);
+ } else {
+ Zotero.debug("Not supported type.");
+ }
+ }
}
- //Zotero.debug(urls);
- // 下面对每条url进行解析
- Zotero.Utilities.processDocuments(urls, scrape, function() { Zotero.done(); },null);
- Zotero.wait();
}