rtfScan.js (21301B)
/*
    ***** BEGIN LICENSE BLOCK *****
    
    Copyright © 2009 Center for History and New Media
                     George Mason University, Fairfax, Virginia, USA
                     http://zotero.org
    
    This file is part of Zotero.
    
    Zotero is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.
    
    Zotero is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.
    
    You should have received a copy of the GNU Affero General Public License
    along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
    
    ***** END LICENSE BLOCK *****
*/

/**
 * @fileOverview Tools for scanning an RTF file for plain-text citations, mapping them to
 *               items, and writing a copy of the file with formatted citations
 */


/**
 * Front end for the RTF Scan wizard
 * @namespace
 */
var Zotero_RTFScan = new function() {
	const ACCEPT_ICON = "chrome://zotero/skin/rtfscan-accept.png";
	const LINK_ICON = "chrome://zotero/skin/rtfscan-link.png";
	const BIBLIOGRAPHY_PLACEHOLDER = "\\{Bibliography\\}";
	
	// state shared between wizard pages
	var inputFile = null, outputFile = null;
	var unmappedCitationsItem, ambiguousCitationsItem, mappedCitationsItem;
	var unmappedCitationsChildren, ambiguousCitationsChildren, mappedCitationsChildren;
	// citations: array of {citationStrings, pages, start, end, suppressAuthor} in document order
	// citationItemIDs: object mapping citation string -> array of candidate item IDs
	// contents: text of the input file (rewritten in place by _formatRTF)
	var citations, citationItemIDs, contents;
	
	/** INTRO PAGE UI **/
	
	/**
	 * Called when the first page is shown; loads the input and output files from preferences,
	 * if they are set
	 */
	this.introPageShowing = function() {
		var inputPath = Zotero.Prefs.get("rtfScan.lastInputFile");
		if(inputPath) {
			inputFile = Components.classes["@mozilla.org/file/local;1"].createInstance(Components.interfaces.nsILocalFile);
			inputFile.initWithPath(inputPath);
		}
		var outputPath = Zotero.Prefs.get("rtfScan.lastOutputFile");
		if(outputPath) {
			outputFile = Components.classes["@mozilla.org/file/local;1"].createInstance(Components.interfaces.nsILocalFile);
			outputFile.initWithPath(outputPath);
		}
		_updatePath();
		document.getElementById("choose-input-file").focus();
	};
	
	/**
	 * Called when the first page is hidden; saves the chosen input and output paths to
	 * preferences
	 */
	this.introPageAdvanced = function() {
		Zotero.Prefs.set("rtfScan.lastInputFile", inputFile.path);
		Zotero.Prefs.set("rtfScan.lastOutputFile", outputFile.path);
	};
	
	/**
	 * Called to select the file to be processed
	 */
	this.chooseInputFile = function() {
		// display file picker
		const nsIFilePicker = Components.interfaces.nsIFilePicker;
		var fp = Components.classes["@mozilla.org/filepicker;1"]
				.createInstance(nsIFilePicker);
		fp.init(window, Zotero.getString("rtfScan.openTitle"), nsIFilePicker.modeOpen);
		
		fp.appendFilters(nsIFilePicker.filterAll);
		fp.appendFilter(Zotero.getString("rtfScan.rtf"), "*.rtf");
		
		var rv = fp.show();
		if (rv == nsIFilePicker.returnOK || rv == nsIFilePicker.returnReplace) {
			inputFile = fp.file;
			_updatePath();
		}
	};
	
	/**
	 * Called to select the output file. The suggested file name is derived from the input
	 * file name (without its extension) when an input file has been chosen.
	 */
	this.chooseOutputFile = function() {
		const nsIFilePicker = Components.interfaces.nsIFilePicker;
		var fp = Components.classes["@mozilla.org/filepicker;1"]
				.createInstance(nsIFilePicker);
		fp.init(window, Zotero.getString("rtfScan.saveTitle"), nsIFilePicker.modeSave);
		fp.appendFilter(Zotero.getString("rtfScan.rtf"), "*.rtf");
		if(inputFile) {
			var leafName = inputFile.leafName;
			var dotIndex = leafName.lastIndexOf(".");
			if(dotIndex != -1) {
				leafName = leafName.substr(0, dotIndex);
			}
			fp.defaultString = leafName+" "+Zotero.getString("rtfScan.scannedFileSuffix")+".rtf";
		} else {
			fp.defaultString = "Untitled.rtf";
		}
		
		var rv = fp.show();
		if (rv == nsIFilePicker.returnOK || rv == nsIFilePicker.returnReplace) {
			outputFile = fp.file;
			_updatePath();
		}
	};
	
	/**
	 * Called to update the path labels in the dialog box; the wizard can only advance once
	 * both an input and an output file are set
	 * @private
	 */
	function _updatePath() {
		document.documentElement.canAdvance = inputFile && outputFile;
		if(inputFile) document.getElementById("input-path").value = inputFile.path;
		if(outputFile) document.getElementById("output-path").value = outputFile.path;
	}
	
	/** SCAN PAGE UI **/
	
	/**
	 * Called when second page is shown.
	 */
	this.scanPageShowing = function() {
		// can't advance
		document.documentElement.canAdvance = false;
		
		// wait a ms so that UI thread gets updated
		window.setTimeout(function() { _scanRTF() }, 1);
	};
	
	/**
	 * Scans file for citations, then proceeds to next wizard page.
	 *
	 * Fills `citations` (one entry per citation cluster, with offsets into `contents`) and
	 * `citationItemIDs` (candidate item IDs per distinct citation string), and adds a row for
	 * each distinct citation string to the unmapped, ambiguous, or mapped section of the tree.
	 * @private
	 */
	var _scanRTF = Zotero.Promise.coroutine(function* () {
		// set up globals
		citations = [];
		citationItemIDs = {};
		
		unmappedCitationsItem = document.getElementById("unmapped-citations-item");
		ambiguousCitationsItem = document.getElementById("ambiguous-citations-item");
		mappedCitationsItem = document.getElementById("mapped-citations-item");
		unmappedCitationsChildren = document.getElementById("unmapped-citations-children");
		ambiguousCitationsChildren = document.getElementById("ambiguous-citations-children");
		mappedCitationsChildren = document.getElementById("mapped-citations-children");
		
		// set up regular expressions
		// this assumes that names are >=2 chars or only capital initials and that there are no
		// more than 4 names
		const nameRe = "(?:[^ .,;]{2,} |[A-Z].? ?){0,3}[A-Z][^ .,;]+";
		const creatorRe = '((?:(?:'+nameRe+', )*'+nameRe+'(?:,? and|,? \\&|,) )?'+nameRe+')(,? et al\\.?)?';
		// TODO: localize "and" term
		const creatorSplitRe = /(?:,| *(?:and|\&)) +/g;
		// NOTE(review): "\." inside this single-quoted string is just ".", so "pp?\.?" accepts any
		// character after "p"/"pp"; left as is because tightening it would reject input that is
		// currently accepted -- confirm intent
		var citationRe = new RegExp('(\\\\\\{|; )('+creatorRe+',? (?:"([^"]+)(?:,"|",) )?([0-9]{4})[a-z]?)(?:,(?: pp?\.?)? ([^ )]+))?(?=;|\\\\\\})|(([A-Z][^ .,;]+)(,? et al\\.?)? (\\\\\\{([0-9]{4})[a-z]?\\\\\\}))', "gm");
		
		// read through RTF file and display items as they're found
		// we could read the file in chunks, but unless people start having memory issues, it's
		// probably faster and definitely simpler if we don't
		//
		// The \'92 replacement uses a global regex: the third "flags" argument to
		// String.prototype.replace is non-standard and ignored by current engines, which left
		// all but the first occurrence untouched.
		// NOTE(review): the newline and \rquote replacements only affect their first occurrence;
		// they are left unchanged because making them global would alter the output text --
		// confirm intent
		contents = Zotero.File.getContents(inputFile).replace(/([^\\\r])\r?\n/, "$1 ").replace(/\\'92/g, "'").replace("\\rquote ", "’");
		var m;
		var lastCitation = false;
		while((m = citationRe.exec(contents))) {
			// determine whether suppressed or standard regular expression was used
			if(m[2]) {	// standard parenthetical
				var citationString = m[2];
				var creators = m[3];
				var etAl = !!m[4];
				var title = m[5];
				var date = m[6];
				var pages = m[7];
				var start = citationRe.lastIndex-m[0].length;
				// the closing "\}" is only matched by a lookahead, so skip over it here
				var end = citationRe.lastIndex+2;
			} else {	// suppressed
				var citationString = m[8];
				var creators = m[9];
				var etAl = !!m[10];
				var title = false;
				var date = m[12];
				var pages = false;
				// only the braced year (m[11]) is replaced; the author name stays in the text
				var start = citationRe.lastIndex-m[11].length;
				var end = citationRe.lastIndex;
			}
			// unescape all braces (global regexes instead of the unsupported "g" flags argument)
			citationString = citationString.replace(/\\\{/g, "{").replace(/\\\}/g, "}");
			var suppressAuthor = !m[2];
			
			if(lastCitation && lastCitation.end >= start) {
				// if this citation is just an extension of the last, add items to it
				lastCitation.citationStrings.push(citationString);
				lastCitation.pages.push(pages);
				lastCitation.end = end;
			} else {
				// otherwise, add another citation
				lastCitation = {"citationStrings":[citationString], "pages":[pages], "start":start,
					"end":end, "suppressAuthor":suppressAuthor};
				citations.push(lastCitation);
			}
			
			// only add each citation once
			if(citationItemIDs[citationString]) continue;
			Zotero.debug("Found citation "+citationString);
			
			// for each individual match, look for an item in the database
			var s = new Zotero.Search;
			// NOTE(review): only the first period is removed -- confirm whether all were intended
			creators = creators.replace(".", "");
			// TODO: localize "et al." term
			// drop empty fragments (e.g. produced by ", and ") so that the creator count used
			// for matching is accurate and no empty name is searched for
			creators = creators.split(creatorSplitRe).filter(creator => creator);
			
			for(var i=0; i<creators.length; i++) {
				var spaceIndex = creators[i].lastIndexOf(" ");
				var lastName = spaceIndex == -1 ? creators[i] : creators[i].substr(spaceIndex+1);
				s.addCondition("lastName", "contains", lastName);
			}
			if(title) s.addCondition("title", "contains", title);
			s.addCondition("date", "is", date);
			var ids = yield s.search();
			Zotero.debug("Mapped to "+ids);
			citationItemIDs[citationString] = ids;
			
			if(!ids || !ids.length) {	// no mapping found (an empty result list is truthy)
				unmappedCitationsChildren.appendChild(_generateItem(citationString, ""));
				unmappedCitationsItem.hidden = undefined;
			} else {	// some mapping found
				var items = yield Zotero.Items.getAsync(ids);
				if(items.length > 1) {
					// check to see how well the author list matches the citation
					var matchedItems = [];
					for(var i=0; i<items.length; i++) {
						yield items[i].loadDataType('creators');
						// pass etAl along so that "et al." citations can match items with
						// more creators than are named in the citation
						if(_matchesItemCreators(creators, items[i], etAl)) matchedItems.push(items[i]);
					}
					
					if(matchedItems.length != 0) items = matchedItems;
				}
				
				if(items.length == 1) {	// only one mapping
					mappedCitationsChildren.appendChild(_generateItem(citationString, items[0].getField("title")));
					citationItemIDs[citationString] = [items[0].id];
					mappedCitationsItem.hidden = undefined;
				} else {				// ambiguous mapping
					var treeitem = _generateItem(citationString, "");
					
					// generate child items
					var treeitemChildren = document.createElement('treechildren');
					treeitem.appendChild(treeitemChildren);
					for(var i=0; i<items.length; i++) {
						treeitemChildren.appendChild(_generateItem("", items[i].getField("title"), true));
					}
					
					// keep the candidate IDs in the same order as the child rows; treeClick
					// resolves a clicked child by its row offset into this array
					citationItemIDs[citationString] = items.map(item => item.id);
					
					treeitem.setAttribute("container", "true");
					treeitem.setAttribute("open", "true");
					ambiguousCitationsChildren.appendChild(treeitem);
					ambiguousCitationsItem.hidden = undefined;
				}
			}
		}
		
		// when scanning is complete, go to citations page
		document.documentElement.canAdvance = true;
		document.documentElement.advance();
	});
	
	/**
	 * Builds a three-cell tree row: citation text, item name, and an icon
	 * @private
	 * @param {String} citationString Label for the first cell
	 * @param {String} itemName Label for the second cell
	 * @param {Boolean} [accept] If true, the icon cell shows the "accept" icon; otherwise the
	 *     "link" icon
	 * @return {Element} The new treeitem element
	 */
	function _generateItem(citationString, itemName, accept) {
		var treeitem = document.createElement('treeitem');
		var treerow = document.createElement('treerow');
		
		var citationCell = document.createElement('treecell');
		citationCell.setAttribute("label", citationString);
		treerow.appendChild(citationCell);
		
		var itemNameCell = document.createElement('treecell');
		itemNameCell.setAttribute("label", itemName);
		treerow.appendChild(itemNameCell);
		
		var iconCell = document.createElement('treecell');
		iconCell.setAttribute("src", accept ? ACCEPT_ICON : LINK_ICON);
		treerow.appendChild(iconCell);
		
		treeitem.appendChild(treerow);
		return treeitem;
	}
	
	/**
	 * Checks whether the creators named in a citation match an item's creators, in order
	 * @private
	 * @param {String[]} creators Creator names as they appear in the citation
	 * @param {Object} item Item whose creators should be compared
	 * @param {Boolean} [etAl] Whether the citation used "et al.", in which case the item may
	 *     have more creators than the citation names
	 * @return {Boolean}
	 */
	function _matchesItemCreators(creators, item, etAl) {
		var itemCreators = item.getCreators();
		var primaryCreators = [];
		var primaryCreatorTypeID = Zotero.CreatorTypes.getPrimaryIDForType(item.itemTypeID);
		
		// use only primary creators if primary creators exist
		for(var i=0; i<itemCreators.length; i++) {
			if(itemCreators[i].creatorTypeID == primaryCreatorTypeID) {
				primaryCreators.push(itemCreators[i]);
			}
		}
		// if primaryCreators matches the creator list length, or if et al is being used, use only
		// primary creators
		if(primaryCreators.length == creators.length || etAl) itemCreators = primaryCreators;
		
		// for us to have an exact match, either the citation creator list length has to match the
		// item creator list length, or et al has to be used
		if(itemCreators.length == creators.length || (etAl && itemCreators.length > creators.length)) {
			var matched = true;
			for(var i=0; i<creators.length; i++) {
				// check each item creator to see if it matches
				matched = matched && _matchesItemCreator(creators[i], itemCreators[i]);
				if(!matched) break;
			}
			return matched;
		}
		
		return false;
	}
	
	/**
	 * Checks whether a single creator name from a citation matches an item creator. The last
	 * name must match the end of the cited name (case-insensitively); if the citation also
	 * contains a first name, it must match either as initials or by its first word.
	 * @private
	 * @param {String} creator Creator name as it appears in the citation
	 * @param {Object} itemCreator Object with `firstName` and `lastName` properties
	 * @return {Boolean}
	 */
	function _matchesItemCreator(creator, itemCreator) {
		// make sure last name matches
		var lowerLast = itemCreator.lastName.toLowerCase();
		if(lowerLast != creator.substr(-lowerLast.length).toLowerCase()) return false;
		
		// make sure first name matches, if it exists
		if(creator.length > lowerLast.length) {
			var firstName = Zotero.Utilities.trim(creator.substr(0, creator.length-lowerLast.length));
			if(firstName.length) {
				// words of the item's first name; empty strings are dropped so that a missing
				// first name does not cause name[0] to be undefined below
				var itemFirstNames = (itemCreator.firstName || "").split(/ +/).filter(name => name);
				
				// check to see whether the first name is all initials
				const initialRe = /^(?:[A-Z]\.? ?)+$/;
				if(initialRe.test(firstName)) {
					var initials = firstName.replace(/[^A-Z]/g, "");
					var itemInitials = itemFirstNames
						.map(name => name[0].toUpperCase())
						.join("");
					if(initials != itemInitials) return false;
				} else {
					// not all initials; verify that the first word of the first name matches.
					// (Previously substr() was passed a string as its length, so the cited
					// first word was always "", and an item first name without a space also
					// yielded "", making the comparison meaningless.)
					var firstWord = firstName.split(/ +/)[0].toLowerCase();
					var itemFirstWord = (itemFirstNames[0] || "").toLowerCase();
					if(firstWord != itemFirstWord) return false;
				}
			}
		}
		
		return true;
	}
	
	/** CITATIONS PAGE UI **/
	
	/**
	 * Called when citations page is shown to determine whether user can immediately advance.
	 */
	this.citationsPageShowing = function() {
		_refreshCanAdvance();
	};
	
	/** 
	 * Called when the citations page is rewound. Removes all citations from the list, hides
	 * the section headings, and returns to intro page.
	 * @return {Boolean} Always false
	 */
	this.citationsPageRewound = function() {
		// skip back to intro page
		document.documentElement.currentPage = document.getElementById('intro-page');
		
		// remove children from tree
		while(unmappedCitationsChildren.hasChildNodes()) {
			unmappedCitationsChildren.removeChild(unmappedCitationsChildren.firstChild);
		}
		while(ambiguousCitationsChildren.hasChildNodes()) {
			ambiguousCitationsChildren.removeChild(ambiguousCitationsChildren.firstChild);
		}
		while(mappedCitationsChildren.hasChildNodes()) {
			mappedCitationsChildren.removeChild(mappedCitationsChildren.firstChild);
		}
		// hide headings
		unmappedCitationsItem.hidden = ambiguousCitationsItem.hidden = mappedCitationsItem.hidden = true;
		
		return false;
	};
	
	/**
	 * Called when a tree item is clicked to remap a citation, or accept a suggestion for an 
	 * ambiguous citation
	 * @param {Event} event Click event; its clientX/clientY are used to find the clicked cell
	 */
	this.treeClick = function(event) {
		var tree = document.getElementById("tree");
		
		// get clicked cell
		var row = { }, col = { }, child = { };
		tree.treeBoxObject.getCellAt(event.clientX, event.clientY, row, col, child);
		
		// figure out which item this corresponds to
		row = row.value;
		// ignore clicks that did not land on a cell, rather than failing on col.value below
		if(row == null || row < 0 || !col.value) return;
		
		var level = tree.view.getLevel(row);
		if(col.value.index == 2 && level > 0) {
			var iconColumn = col.value;
			var itemNameColumn = iconColumn.getPrevious();
			var citationColumn = itemNameColumn.getPrevious();
			
			if(level == 2) {		// ambiguous citation item
				// get relevant information
				var parentIndex = tree.view.getParentIndex(row);
				var citation = tree.view.getCellText(parentIndex, citationColumn);
				var itemName = tree.view.getCellText(row, itemNameColumn);
				
				// update item name on parent and delete children
				tree.view.setCellText(parentIndex, itemNameColumn, itemName);
				var treeitem = tree.view.getItemAtIndex(row);
				treeitem.parentNode.parentNode.removeChild(treeitem.parentNode);
				
				// update array
				citationItemIDs[citation] = [citationItemIDs[citation][row-parentIndex-1]];
			} else {				// mapped or unmapped citation, or ambiguous citation parent
				var citation = tree.view.getCellText(row, citationColumn);
				var io = {singleSelection:true};
				if(citationItemIDs[citation] && citationItemIDs[citation].length == 1) {	// mapped citation
					// specify that item should be selected in window
					io.select = citationItemIDs[citation];
				}
				
				window.openDialog('chrome://zotero/content/selectItemsDialog.xul', '', 'chrome,modal', io);
				
				if(io.dataOut && io.dataOut.length) {
					var selectedItemID = io.dataOut[0];
					var selectedItem = Zotero.Items.get(selectedItemID);
					
					var treeitem = tree.view.getItemAtIndex(row);
					
					// remove any children (if ambiguous)
					var children = treeitem.getElementsByTagName("treechildren");
					if(children.length) treeitem.removeChild(children[0]);
					
					// update item name
					tree.view.setCellText(row, itemNameColumn, selectedItem.getField("title"));
					
					// update array
					citationItemIDs[citation] = [selectedItemID];
				}
			}
		}
		_refreshCanAdvance();
	};
	
	/**
	 * Determines whether the button to advance the wizard should be enabled or not based on whether
	 * unmapped citations exist, and sets the status appropriately
	 * @private
	 */
	function _refreshCanAdvance() {
		var canAdvance = true;
		// citationItemIDs is a plain object keyed by citation string, which is not iterable
		// with for...of; enumerate its keys instead
		for (let citationString in citationItemIDs) {
			let itemList = citationItemIDs[citationString];
			if(!itemList || itemList.length != 1) {
				canAdvance = false;
				break;
			}
		}
		
		document.documentElement.canAdvance = canAdvance;
	}
	
	/** STYLE PAGE UI **/
	
	/**
	 * Called when style page is shown to add styles to listbox.
	 */
	this.stylePageShowing = function() {
		Zotero_File_Interface_Bibliography.init();
	};
	
	/**
	 * Called when style page is hidden to save preferences.
	 */
	this.stylePageAdvanced = function() {
		Zotero.Prefs.set("export.lastStyle", document.getElementById("style-listbox").selectedItem.value);
	};
	
	/** FORMAT PAGE UI **/
	
	/**
	 * Called when the format page is shown; starts formatting the document
	 */
	this.formatPageShowing = function() {
		// can't advance
		document.documentElement.canAdvance = false;
		
		// wait a ms so that UI thread gets updated
		window.setTimeout(function() { _formatRTF() }, 1);
	};
	
	/**
	 * Replaces each scanned citation in `contents` with its formatted form, inserts the
	 * bibliography if the selected style has one, writes the result to the output file, and
	 * advances the wizard
	 * @private
	 */
	function _formatRTF() {
		// load style and create ItemSet with all items
		var zStyle = Zotero.Styles.get(document.getElementById("style-listbox").value);
		var locale = document.getElementById("locale-menu").value;
		var style = zStyle.getCiteProc(locale);
		style.setOutputFormat("rtf");
		var isNote = style.class == "note";
		
		// create citations
		var cslCitations = [];
		var itemIDs = {};
		for(var i=0; i<citations.length; i++) {
			var citation = citations[i];
			var cslCitation = {"citationItems":[], "properties":{}};
			if(isNote) {
				cslCitation.properties.noteIndex = i;
			}
			
			// create citation items
			for(var j=0; j<citation.citationStrings.length; j++) {
				var citationItem = {};
				citationItem.id = citationItemIDs[citation.citationStrings[j]][0];
				itemIDs[citationItem.id] = true;
				citationItem.locator = citation.pages[j];
				citationItem.label = "page";
				citationItem["suppress-author"] = citation.suppressAuthor && !isNote;
				cslCitation.citationItems.push(citationItem);
			}
			
			cslCitations.push(cslCitation);
		}
		Zotero.debug(cslCitations);
		
		itemIDs = Object.keys(itemIDs);
		Zotero.debug(itemIDs);
		
		// prepare the list of rendered citations
		var citationResults = style.rebuildProcessorState(cslCitations, "rtf");
		
		// format citations
		var contentArray = [];
		var lastEnd = 0;
		for(var i=0; i<citations.length; i++) {
			var citation = citationResults[i][2];
			Zotero.debug("Formatted "+citation);
			
			// if using notes, we might have to move the note after the punctuation
			if(isNote && citations[i].start != 0 && contents[citations[i].start-1] == " ") {
				contentArray.push(contents.substring(lastEnd, citations[i].start-1));
			} else {
				contentArray.push(contents.substring(lastEnd, citations[i].start));
			}
			
			lastEnd = citations[i].end;
			if(isNote && citations[i].end < contents.length && ".,!?".indexOf(contents[citations[i].end]) !== -1) {
				contentArray.push(contents[citations[i].end]);
				lastEnd++;
			}
			
			if(isNote) {
				if(document.getElementById("displayAs").selectedIndex) {	// endnotes
					contentArray.push("{\\super\\chftn}\\ftnbj {\\footnote\\ftnalt {\\super\\chftn } "+citation+"}");
				} else {													// footnotes
					contentArray.push("{\\super\\chftn}\\ftnbj {\\footnote {\\super\\chftn } "+citation+"}");
				}
			} else {
				contentArray.push(citation);
			}
		}
		contentArray.push(contents.substring(lastEnd));
		contents = contentArray.join("");
		
		// add bibliography
		if(zStyle.hasBibliography) {
			var bibliography = Zotero.Cite.makeFormattedBibliography(style, "rtf");
			// presumably strips the RTF wrapper (5 leading characters and the final one) --
			// TODO confirm against makeFormattedBibliography's output
			bibliography = bibliography.substring(5, bibliography.length-1);
			// fix line breaks
			var linebreak = "\r\n";
			if(contents.indexOf("\r\n") == -1) {
				// global regex: the "g" flags argument to replace() is not supported, which
				// would convert only the first line break
				bibliography = bibliography.replace(/\r\n/g, "\n");
				linebreak = "\n";
			}
			
			// Function replacers are used below so that "$" sequences in the bibliography
			// text are inserted literally rather than interpreted as replacement patterns.
			if(contents.indexOf(BIBLIOGRAPHY_PLACEHOLDER) !== -1) {
				contents = contents.replace(BIBLIOGRAPHY_PLACEHOLDER, () => bibliography);
			} else {
				// add two newlines before bibliography
				bibliography = linebreak+"\\"+linebreak+"\\"+linebreak+bibliography;
				
				// add bibliography automatically inside last set of brackets closed
				const bracketRe = /^\{+/;
				var m = bracketRe.exec(contents);
				if(m) {
					var closeBracketRe = new RegExp("(\\}{"+m[0].length+"}\\s*)$");
					contents = contents.replace(closeBracketRe, (match, closing) => bibliography+closing);
				} else {
					contents += bibliography;
				}
			}
		}
		
		Zotero.File.putContents(outputFile, contents);
		
		// save locale
		if (!document.getElementById("locale-menu").disabled) {
			Zotero.Prefs.set("export.lastLocale", locale);
		}
		
		document.documentElement.canAdvance = true;
		document.documentElement.advance();
	}
};