duplicates.js (14139B)
1 /* 2 ***** BEGIN LICENSE BLOCK ***** 3 4 Copyright © 2009 Center for History and New Media 5 George Mason University, Fairfax, Virginia, USA 6 http://zotero.org 7 8 This file is part of Zotero. 9 10 Zotero is free software: you can redistribute it and/or modify 11 it under the terms of the GNU Affero General Public License as published by 12 the Free Software Foundation, either version 3 of the License, or 13 (at your option) any later version. 14 15 Zotero is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU Affero General Public License for more details. 19 20 You should have received a copy of the GNU Affero General Public License 21 along with Zotero. If not, see <http://www.gnu.org/licenses/>. 22 23 ***** END LICENSE BLOCK ***** 24 */ 25 26 Zotero.Duplicates = function (libraryID) { 27 if (typeof libraryID == 'undefined') { 28 throw ("libraryID not provided in Zotero.Duplicates constructor"); 29 } 30 31 if (!libraryID) { 32 libraryID = Zotero.Libraries.userLibraryID; 33 } 34 35 this._libraryID = libraryID; 36 } 37 38 39 Zotero.Duplicates.prototype.__defineGetter__('name', function () { return Zotero.getString('pane.collections.duplicate'); }); 40 Zotero.Duplicates.prototype.__defineGetter__('libraryID', function () { return this._libraryID; }); 41 42 /** 43 * Get duplicates, populate a temporary table, and return a search based 44 * on that table 45 * 46 * @return {Zotero.Search} 47 */ 48 Zotero.Duplicates.prototype.getSearchObject = async function () { 49 var table = 'tmpDuplicates_' + Zotero.Utilities.randomString(); 50 51 await this._findDuplicates(); 52 var ids = this._sets.findAll(true); 53 54 // Zotero.CollectionTreeRow::getSearchObject() extracts the table name and creates an 55 // unload listener that drops the table when the ItemTreeView is unregistered 56 var sql = `CREATE TEMPORARY TABLE ${table} (id INTEGER PRIMARY KEY)`; 57 await Zotero.DB.queryAsync(sql); 58 59 if (ids.length) { 60 Zotero.debug("Inserting rows into temp table"); 61 sql = `INSERT INTO ${table} VALUES `; 62 await Zotero.Utilities.Internal.forEachChunkAsync( 63 ids, 64 Zotero.DB.MAX_BOUND_PARAMETERS, 65 async function (chunk) { 66 let idStr = '(' + chunk.join('), (') + ')'; 67 await Zotero.DB.queryAsync(sql + idStr, false, { debug: false }); 68 } 69 ); 70 Zotero.debug("Done"); 71 } 72 else { 73 Zotero.debug("No duplicates found"); 74 } 75 76 var s = new Zotero.Search; 77 s.libraryID = this._libraryID; 78 s.addCondition('tempTable', 'is', table); 79 return s; 80 }; 81 82 83 /** 84 * Finds all items in the same set as a given item 85 * 86 * @param {Integer} itemID 87 * @return {Integer[]} Array of itemIDs 88 */ 89 Zotero.Duplicates.prototype.getSetItemsByItemID = function (itemID) { 90 return this._sets.findAllInSet(this._getObjectFromID(itemID), true); 91 } 92 93 94 Zotero.Duplicates.prototype._getObjectFromID = function (id) { 95 return { 96 get id() { return id; } 97 } 98 } 99 100 101 Zotero.Duplicates.prototype._findDuplicates = Zotero.Promise.coroutine(function* () { 102 Zotero.debug("Finding duplicates"); 103 104 var start = Date.now(); 105 106 var self = this; 107 108 this._sets = new Zotero.DisjointSetForest; 109 var sets = this._sets; 110 111 function normalizeString(str) { 112 // Make sure we have a string and not an integer 113 str = str + ""; 114 115 if (str === "") { 116 return ""; 117 } 118 119 str = Zotero.Utilities.removeDiacritics(str) 120 .replace(/[ !-/:-@[-`{-~]+/g, ' ') // Convert (ASCII) punctuation to spaces 121 .trim() 122 .toLowerCase(); 123 124 return str; 125 } 126 127 function sortByValue(a, b) { 128 if((a.value === null && b.value !== null) 129 || (a.value === undefined && b.value !== undefined) 130 || a.value < b.value) { 131 return -1; 132 } 133 134 if(a.value === b.value) return 0; 135 136 return 1; 137 } 138 139 /** 140 * @param {Function} compareRows Comparison function, if not exact match 141 * @param {Boolean} reprocessMatches Compare every row against every other, 142 * without skipping ahead to the last match. 143 * This is necessary for multi-dimensional 144 * matches such as title + at least one creator. 145 * Without it, only one set of matches would be 146 * found per matching title, since items with 147 * different creators wouldn't match the first 148 * set and the next start row would be a 149 * different title. 150 */ 151 function processRows(rows, compareRows, reprocessMatches) { 152 if (!rows.length) { 153 return; 154 } 155 156 for (var i = 0, len = rows.length; i < len; i++) { 157 var j = i + 1, lastMatch = false; 158 while (j < len) { 159 if (compareRows) { 160 var match = compareRows(rows[i], rows[j]); 161 // Not a match, and don't try any more with this i value 162 if (match == -1) { 163 break; 164 } 165 // Not a match, but keep looking 166 if (match == 0) { 167 j++; 168 continue; 169 } 170 } 171 // If no comparison function, check for exact match 172 else { 173 if (!rows[i].value || !rows[j].value 174 || (rows[i].value !== rows[j].value) 175 ) { 176 break; 177 } 178 } 179 180 sets.union( 181 self._getObjectFromID(rows[i].itemID), 182 self._getObjectFromID(rows[j].itemID) 183 ); 184 185 lastMatch = j; 186 j++; 187 } 188 if (!reprocessMatches && lastMatch) { 189 i = lastMatch; 190 } 191 } 192 } 193 194 // Match books by ISBN 195 var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) " 196 + "JOIN itemDataValues USING (valueID) " 197 + "WHERE libraryID=? AND itemTypeID=? AND fieldID=? " 198 + "AND itemID NOT IN (SELECT itemID FROM deletedItems)"; 199 var rows = yield Zotero.DB.queryAsync( 200 sql, 201 [ 202 this._libraryID, 203 Zotero.ItemTypes.getID('book'), 204 Zotero.ItemFields.getID('ISBN') 205 ] 206 ); 207 var isbnCache = {}; 208 if (rows.length) { 209 let newRows = []; 210 for (let i = 0; i < rows.length; i++) { 211 let row = rows[i]; 212 let newVal = Zotero.Utilities.cleanISBN('' + row.value); 213 if (!newVal) continue; 214 isbnCache[row.itemID] = newVal; 215 newRows.push({ 216 itemID: row.itemID, 217 value: newVal 218 }); 219 } 220 newRows.sort(sortByValue); 221 processRows(newRows); 222 } 223 224 // DOI 225 var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) " 226 + "JOIN itemDataValues USING (valueID) " 227 + "WHERE libraryID=? AND fieldID=? AND value LIKE ? " 228 + "AND itemID NOT IN (SELECT itemID FROM deletedItems)"; 229 var rows = yield Zotero.DB.queryAsync( 230 sql, 231 [ 232 this._libraryID, 233 Zotero.ItemFields.getID('DOI'), 234 '10.%' 235 ] 236 ); 237 var doiCache = {}; 238 if (rows.length) { 239 let newRows = []; 240 for (let i = 0; i < rows.length; i++) { 241 let row = rows[i]; 242 // DOIs are case insensitive 243 let newVal = (row.value + '').trim().toUpperCase(); 244 doiCache[row.itemID] = newVal; 245 newRows.push({ 246 itemID: row.itemID, 247 value: newVal 248 }); 249 } 250 newRows.sort(sortByValue); 251 processRows(newRows); 252 } 253 254 // Get years 255 var dateFields = [Zotero.ItemFields.getID('date')].concat( 256 Zotero.ItemFields.getTypeFieldsFromBase('date') 257 ); 258 var sql = "SELECT itemID, SUBSTR(value, 1, 4) AS year FROM items " 259 + "JOIN itemData USING (itemID) " 260 + "JOIN itemDataValues USING (valueID) " 261 + "WHERE libraryID=? AND fieldID IN (" 262 + dateFields.map(() => '?').join() + ") " 263 + "AND SUBSTR(value, 1, 4) != '0000' " 264 + "AND itemID NOT IN (SELECT itemID FROM deletedItems) " 265 + "ORDER BY value"; 266 var rows = yield Zotero.DB.queryAsync(sql, [this._libraryID].concat(dateFields)); 267 var yearCache = {}; 268 for (let i = 0; i < rows.length; i++) { 269 let row = rows[i]; 270 yearCache[row.itemID] = row.year; 271 } 272 273 // Match on normalized title 274 var titleIDs = Zotero.ItemFields.getTypeFieldsFromBase('title'); 275 titleIDs.push(Zotero.ItemFields.getID('title')); 276 var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) " 277 + "JOIN itemDataValues USING (valueID) " 278 + "WHERE libraryID=? AND fieldID IN " 279 + "(" + titleIDs.join(', ') + ") " 280 + "AND itemTypeID NOT IN (1, 14) " 281 + "AND itemID NOT IN (SELECT itemID FROM deletedItems)"; 282 var rows = yield Zotero.DB.queryAsync(sql, [this._libraryID]); 283 if (rows.length) { 284 //normalize all values ahead of time 285 rows = rows.map(function(row) { 286 return { 287 itemID: row.itemID, 288 value: normalizeString(row.value) 289 }; 290 }); 291 //sort rows by normalized values 292 rows.sort(sortByValue); 293 294 // Get all creators and separate by itemID 295 // 296 // We won't need all of these, but otherwise we would have to make processRows() 297 // asynchronous, which would be too slow 298 let creatorRowsCache = {}; 299 let sql = "SELECT itemID, lastName, firstName, fieldMode FROM items " 300 + "JOIN itemCreators USING (itemID) " 301 + "JOIN creators USING (creatorID) " 302 + "WHERE libraryID=? AND itemTypeID NOT IN (1, 14) AND " 303 + "itemID NOT IN (SELECT itemID FROM deletedItems)" 304 + "ORDER BY itemID, orderIndex"; 305 let creatorRows = yield Zotero.DB.queryAsync(sql, this._libraryID); 306 let lastItemID; 307 let itemCreators = []; 308 for (let i = 0; i < creatorRows.length; i++) { 309 let row = creatorRows[i]; 310 if (lastItemID && row.itemID != lastItemID) { 311 if (itemCreators.length) { 312 creatorRowsCache[lastItemID] = itemCreators; 313 itemCreators = []; 314 } 315 } 316 317 lastItemID = row.itemID; 318 319 itemCreators.push({ 320 lastName: normalizeString(row.lastName), 321 firstInitial: row.fieldMode == 0 ? normalizeString(row.firstName).charAt(0) : false 322 }); 323 } 324 // Add final item creators 325 if (itemCreators.length) { 326 creatorRowsCache[lastItemID] = itemCreators; 327 } 328 329 processRows(rows, function (a, b) { 330 var aTitle = a.value; 331 var bTitle = b.value; 332 333 // If we stripped one of the strings completely, we can't compare them 334 if(!aTitle || !bTitle) { 335 return -1; 336 } 337 338 if (aTitle !== bTitle) { 339 return -1; //everything is sorted by title, so if this mismatches, everything following will too 340 } 341 342 // If both items have a DOI and they don't match, it's not a dupe 343 if (typeof doiCache[a.itemID] != 'undefined' 344 && typeof doiCache[b.itemID] != 'undefined' 345 && doiCache[a.itemID] != doiCache[b.itemID]) { 346 return 0; 347 } 348 349 // If both items have an ISBN and they don't match, it's not a dupe 350 if (typeof isbnCache[a.itemID] != 'undefined' 351 && typeof isbnCache[b.itemID] != 'undefined' 352 && isbnCache[a.itemID] != isbnCache[b.itemID]) { 353 return 0; 354 } 355 356 // If both items have a year and they're off by more than one, it's not a dupe 357 if (typeof yearCache[a.itemID] != 'undefined' 358 && typeof yearCache[b.itemID] != 'undefined' 359 && Math.abs(yearCache[a.itemID] - yearCache[b.itemID]) > 1) { 360 return 0; 361 } 362 363 // Check for at least one match on last name + first initial of first name 364 var aCreatorRows, bCreatorRows; 365 if (typeof creatorRowsCache[a.itemID] != 'undefined') { 366 aCreatorRows = creatorRowsCache[a.itemID]; 367 } 368 if (typeof creatorRowsCache[b.itemID] != 'undefined') { 369 bCreatorRows = creatorRowsCache[b.itemID]; 370 } 371 372 // Match if no creators 373 if (!aCreatorRows && !bCreatorRows) { 374 return 1; 375 } 376 377 if (!aCreatorRows || !bCreatorRows) { 378 return 0; 379 } 380 381 for (let i = 0; i < aCreatorRows.length; i++) { 382 let aCreatorRow = aCreatorRows[i]; 383 let aLastName = aCreatorRow.lastName; 384 let aFirstInitial = aCreatorRow.firstInitial; 385 386 for (let j = 0; j < bCreatorRows.length; j++) { 387 let bCreatorRow = bCreatorRows[j]; 388 let bLastName = bCreatorRow.lastName; 389 let bFirstInitial = bCreatorRow.firstInitial; 390 391 if (aLastName === bLastName && aFirstInitial === bFirstInitial) { 392 return 1; 393 } 394 } 395 } 396 397 return 0; 398 }, true); 399 } 400 401 // Match on exact fields 402 /*var fields = ['']; 403 for (let field of fields) { 404 var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) " 405 + "JOIN itemDataValues USING (valueID) " 406 + "WHERE libraryID=? AND fieldID=? " 407 + "AND itemID NOT IN (SELECT itemID FROM deletedItems) " 408 + "ORDER BY value"; 409 var rows = yield Zotero.DB.queryAsync(sql, [this._libraryID, Zotero.ItemFields.getID(field)]); 410 processRows(rows); 411 }*/ 412 413 Zotero.debug("Found duplicates in " + (Date.now() - start) + " ms"); 414 }); 415 416 417 418 /** 419 * Implements the Disjoint Set data structure 420 * 421 * Based on pseudo-code from http://en.wikipedia.org/wiki/Disjoint-set_data_structure 422 * 423 * Objects passed should have .id properties that uniquely identify them 424 */ 425 426 Zotero.DisjointSetForest = function () { 427 this._objects = {}; 428 } 429 430 Zotero.DisjointSetForest.prototype.find = function (x) { 431 var id = x.id; 432 433 // If we've seen this object before, use the existing copy, 434 // which will have .parent and .rank properties 435 if (this._objects[id]) { 436 var obj = this._objects[id]; 437 } 438 // Otherwise initialize it as a new set 439 else { 440 this._makeSet(x); 441 this._objects[id] = x; 442 var obj = x; 443 } 444 445 if (obj.parent.id == obj.id) { 446 return obj; 447 } 448 else { 449 obj.parent = this.find(obj.parent); 450 return obj.parent; 451 } 452 } 453 454 455 Zotero.DisjointSetForest.prototype.union = function (x, y) { 456 var xRoot = this.find(x); 457 var yRoot = this.find(y); 458 459 // Already in same set 460 if (xRoot.id == yRoot.id) { 461 return; 462 } 463 464 if (xRoot.rank < yRoot.rank) { 465 xRoot.parent = yRoot; 466 } 467 else if (xRoot.rank > yRoot.rank) { 468 yRoot.parent = xRoot; 469 } 470 else { 471 yRoot.parent = xRoot; 472 xRoot.rank = xRoot.rank + 1; 473 } 474 } 475 476 477 Zotero.DisjointSetForest.prototype.sameSet = function (x, y) { 478 return this.find(x) == this.find(y); 479 } 480 481 482 Zotero.DisjointSetForest.prototype.findAll = function (asIDs) { 483 var objects = []; 484 for (let i in this._objects) { 485 let obj = this._objects[i]; 486 objects.push(asIDs ? obj.id : obj); 487 } 488 return objects; 489 } 490 491 492 Zotero.DisjointSetForest.prototype.findAllInSet = function (x, asIDs) { 493 var xRoot = this.find(x); 494 var objects = []; 495 for (let i in this._objects) { 496 let obj = this._objects[i]; 497 if (this.find(obj) == xRoot) { 498 objects.push(asIDs ? obj.id : obj); 499 } 500 } 501 return objects; 502 } 503 504 505 Zotero.DisjointSetForest.prototype._makeSet = function (x) { 506 x.parent = x; 507 x.rank = 0; 508 }