www

Unnamed repository; edit this file 'description' to name the repository.
Log | Files | Refs | Submodules | README | LICENSE

duplicates.js (14139B)


      1 /*
      2     ***** BEGIN LICENSE BLOCK *****
      3     
      4     Copyright © 2009 Center for History and New Media
      5                      George Mason University, Fairfax, Virginia, USA
      6                      http://zotero.org
      7     
      8     This file is part of Zotero.
      9     
     10     Zotero is free software: you can redistribute it and/or modify
     11     it under the terms of the GNU Affero General Public License as published by
     12     the Free Software Foundation, either version 3 of the License, or
     13     (at your option) any later version.
     14     
     15     Zotero is distributed in the hope that it will be useful,
     16     but WITHOUT ANY WARRANTY; without even the implied warranty of
     17     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     18     GNU Affero General Public License for more details.
     19     
     20     You should have received a copy of the GNU Affero General Public License
     21     along with Zotero.  If not, see <http://www.gnu.org/licenses/>.
     22     
     23     ***** END LICENSE BLOCK *****
     24 */
     25 
     26 Zotero.Duplicates = function (libraryID) {
     27 	if (typeof libraryID == 'undefined') {
     28 		throw ("libraryID not provided in Zotero.Duplicates constructor");
     29 	}
     30 	
     31 	if (!libraryID) {
     32 		libraryID = Zotero.Libraries.userLibraryID;
     33 	}
     34 	
     35 	this._libraryID = libraryID;
     36 }
     37 
     38 
     39 Zotero.Duplicates.prototype.__defineGetter__('name', function () { return Zotero.getString('pane.collections.duplicate'); });
     40 Zotero.Duplicates.prototype.__defineGetter__('libraryID', function () { return this._libraryID; });
     41 
     42 /**
     43  * Get duplicates, populate a temporary table, and return a search based
     44  * on that table
     45  *
     46  * @return {Zotero.Search}
     47  */
     48 Zotero.Duplicates.prototype.getSearchObject = async function () {
     49 	var table = 'tmpDuplicates_' + Zotero.Utilities.randomString();
     50 	
     51 	await this._findDuplicates();
     52 	var ids = this._sets.findAll(true);
     53 	
     54 	// Zotero.CollectionTreeRow::getSearchObject() extracts the table name and creates an
     55 	// unload listener that drops the table when the ItemTreeView is unregistered
     56 	var sql = `CREATE TEMPORARY TABLE ${table} (id INTEGER PRIMARY KEY)`;
     57 	await Zotero.DB.queryAsync(sql);
     58 	
     59 	if (ids.length) {
     60 		Zotero.debug("Inserting rows into temp table");
     61 		sql = `INSERT INTO ${table} VALUES `;
     62 		await Zotero.Utilities.Internal.forEachChunkAsync(
     63 			ids,
     64 			Zotero.DB.MAX_BOUND_PARAMETERS,
     65 			async function (chunk) {
     66 				let idStr = '(' + chunk.join('), (') + ')';
     67 				await Zotero.DB.queryAsync(sql + idStr, false, { debug: false });
     68 			}
     69 		);
     70 		Zotero.debug("Done");
     71 	}
     72 	else {
     73 		Zotero.debug("No duplicates found");
     74 	}
     75 	
     76 	var s = new Zotero.Search;
     77 	s.libraryID = this._libraryID;
     78 	s.addCondition('tempTable', 'is', table);
     79 	return s;
     80 };
     81 
     82 
     83 /**
     84  * Finds all items in the same set as a given item
     85  *
     86  * @param {Integer} itemID
     87  * @return {Integer[]}  Array of itemIDs
     88  */
     89 Zotero.Duplicates.prototype.getSetItemsByItemID = function (itemID) {
     90 	return this._sets.findAllInSet(this._getObjectFromID(itemID), true);
     91 }
     92 
     93 
     94 Zotero.Duplicates.prototype._getObjectFromID = function (id) {
     95 	return {
     96 		get id() { return id; }
     97 	}
     98 }
     99 
    100 
    101 Zotero.Duplicates.prototype._findDuplicates = Zotero.Promise.coroutine(function* () {
    102 	Zotero.debug("Finding duplicates");
    103 	
    104 	var start = Date.now();
    105 	
    106 	var self = this;
    107 	
    108 	this._sets = new Zotero.DisjointSetForest;
    109 	var sets = this._sets;
    110 	
    111 	function normalizeString(str) {
    112 		// Make sure we have a string and not an integer
    113 		str = str + "";
    114 		
    115 		if (str === "") {
    116 			return "";
    117 		}
    118 		
    119 		str = Zotero.Utilities.removeDiacritics(str)
    120 			.replace(/[ !-/:-@[-`{-~]+/g, ' ') // Convert (ASCII) punctuation to spaces
    121 			.trim()
    122 			.toLowerCase();
    123 		
    124 		return str;
    125 	}
    126 	
    127 	function sortByValue(a, b) {
    128 		if((a.value === null && b.value !== null)
    129 			|| (a.value === undefined && b.value !== undefined)
    130 			|| a.value < b.value) {
    131 			return -1;
    132 		}
    133 		
    134 		if(a.value === b.value) return 0;
    135 		
    136 		return 1;
    137 	}
    138 	
    139 	/**
    140 	 * @param {Function} compareRows  Comparison function, if not exact match
    141 	 * @param {Boolean} reprocessMatches  Compare every row against every other,
    142 	 *                                    without skipping ahead to the last match.
    143 	 *                                    This is necessary for multi-dimensional
    144 	 *                                    matches such as title + at least one creator.
    145 	 *                                    Without it, only one set of matches would be
    146 	 *                                    found per matching title, since items with
    147 	 *                                    different creators wouldn't match the first
    148 	 *                                    set and the next start row would be a
    149 	 *                                    different title.
    150 	 */
    151 	function processRows(rows, compareRows, reprocessMatches) {
    152 		if (!rows.length) {
    153 			return;
    154 		}
    155 		
    156 		for (var i = 0, len = rows.length; i < len; i++) {
    157 			var j = i + 1, lastMatch = false;
    158 			while (j < len) {
    159 				if (compareRows) {
    160 					var match = compareRows(rows[i], rows[j]);
    161 					// Not a match, and don't try any more with this i value
    162 					if (match == -1) {
    163 						break;
    164 					}
    165 					// Not a match, but keep looking
    166 					if (match == 0) {
    167 						j++;
    168 						continue;
    169 					}
    170 				}
    171 				// If no comparison function, check for exact match
    172 				else {
    173 					if (!rows[i].value || !rows[j].value
    174 						|| (rows[i].value !== rows[j].value)
    175 					) {
    176 						break;
    177 					}
    178 				}
    179 				
    180 				sets.union(
    181 					self._getObjectFromID(rows[i].itemID),
    182 					self._getObjectFromID(rows[j].itemID)
    183 				);
    184 				
    185 				lastMatch = j;
    186 				j++;
    187 			}
    188 			if (!reprocessMatches && lastMatch) {
    189 				i = lastMatch;
    190 			}
    191 		}
    192 	}
    193 	
    194 	// Match books by ISBN
    195 	var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) "
    196 				+ "JOIN itemDataValues USING (valueID) "
    197 				+ "WHERE libraryID=? AND itemTypeID=? AND fieldID=? "
    198 				+ "AND itemID NOT IN (SELECT itemID FROM deletedItems)";
    199 	var rows = yield Zotero.DB.queryAsync(
    200 		sql,
    201 		[
    202 			this._libraryID,
    203 			Zotero.ItemTypes.getID('book'),
    204 			Zotero.ItemFields.getID('ISBN')
    205 		]
    206 	);
    207 	var isbnCache = {};
    208 	if (rows.length) {
    209 		let newRows = [];
    210 		for (let i = 0; i < rows.length; i++) {
    211 			let row = rows[i];
    212 			let newVal = Zotero.Utilities.cleanISBN('' + row.value);
    213 			if (!newVal) continue;
    214 			isbnCache[row.itemID] = newVal;
    215 			newRows.push({
    216 				itemID: row.itemID,
    217 				value: newVal
    218 			});
    219 		}
    220 		newRows.sort(sortByValue);
    221 		processRows(newRows);
    222 	}
    223 	
    224 	// DOI
    225 	var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) "
    226 				+ "JOIN itemDataValues USING (valueID) "
    227 				+ "WHERE libraryID=? AND fieldID=? AND value LIKE ? "
    228 				+ "AND itemID NOT IN (SELECT itemID FROM deletedItems)";
    229 	var rows = yield Zotero.DB.queryAsync(
    230 		sql,
    231 		[
    232 			this._libraryID,
    233 			Zotero.ItemFields.getID('DOI'),
    234 			'10.%'
    235 		]
    236 	);
    237 	var doiCache = {};
    238 	if (rows.length) {
    239 		let newRows = [];
    240 		for (let i = 0; i < rows.length; i++) {
    241 			let row = rows[i];
    242 			// DOIs are case insensitive
    243 			let newVal = (row.value + '').trim().toUpperCase();
    244 			doiCache[row.itemID] = newVal;
    245 			newRows.push({
    246 				itemID: row.itemID,
    247 				value: newVal
    248 			});
    249 		}
    250 		newRows.sort(sortByValue);
    251 		processRows(newRows);
    252 	}
    253 	
    254 	// Get years
    255 	var dateFields = [Zotero.ItemFields.getID('date')].concat(
    256 		Zotero.ItemFields.getTypeFieldsFromBase('date')
    257 	);
    258 	var sql = "SELECT itemID, SUBSTR(value, 1, 4) AS year FROM items "
    259 				+ "JOIN itemData USING (itemID) "
    260 				+ "JOIN itemDataValues USING (valueID) "
    261 				+ "WHERE libraryID=? AND fieldID IN ("
    262 				+ dateFields.map(() => '?').join() + ") "
    263 				+ "AND SUBSTR(value, 1, 4) != '0000' "
    264 				+ "AND itemID NOT IN (SELECT itemID FROM deletedItems) "
    265 				+ "ORDER BY value";
    266 	var rows = yield Zotero.DB.queryAsync(sql, [this._libraryID].concat(dateFields));
    267 	var yearCache = {};
    268 	for (let i = 0; i < rows.length; i++) {
    269 		let row = rows[i];
    270 		yearCache[row.itemID] = row.year;
    271 	}
    272 	
    273 	// Match on normalized title
    274 	var titleIDs = Zotero.ItemFields.getTypeFieldsFromBase('title');
    275 	titleIDs.push(Zotero.ItemFields.getID('title'));
    276 	var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) "
    277 				+ "JOIN itemDataValues USING (valueID) "
    278 				+ "WHERE libraryID=? AND fieldID IN "
    279 				+ "(" + titleIDs.join(', ') + ") "
    280 				+ "AND itemTypeID NOT IN (1, 14) "
    281 				+ "AND itemID NOT IN (SELECT itemID FROM deletedItems)";
    282 	var rows = yield Zotero.DB.queryAsync(sql, [this._libraryID]);
    283 	if (rows.length) {
    284 		//normalize all values ahead of time
    285 		rows = rows.map(function(row) {
    286 			return {
    287 				itemID: row.itemID,
    288 				value: normalizeString(row.value)
    289 			};
    290 		});
    291 		//sort rows by normalized values
    292 		rows.sort(sortByValue);
    293 		
    294 		// Get all creators and separate by itemID
    295 		//
    296 		// We won't need all of these, but otherwise we would have to make processRows()
    297 		// asynchronous, which would be too slow
    298 		let creatorRowsCache = {};
    299 		let sql = "SELECT itemID, lastName, firstName, fieldMode FROM items "
    300 			+ "JOIN itemCreators USING (itemID) "
    301 			+ "JOIN creators USING (creatorID) "
    302 			+ "WHERE libraryID=? AND itemTypeID NOT IN (1, 14) AND "
    303 			+ "itemID NOT IN (SELECT itemID FROM deletedItems)"
    304 			+ "ORDER BY itemID, orderIndex";
    305 		let creatorRows = yield Zotero.DB.queryAsync(sql, this._libraryID);
    306 		let lastItemID;
    307 		let itemCreators = [];
    308 		for (let i = 0; i < creatorRows.length; i++) {
    309 			let row = creatorRows[i];
    310 			if (lastItemID && row.itemID != lastItemID) {
    311 				if (itemCreators.length) {
    312 					creatorRowsCache[lastItemID] = itemCreators;
    313 					itemCreators = [];
    314 				}
    315 			}
    316 			
    317 			lastItemID = row.itemID;
    318 			
    319 			itemCreators.push({
    320 				lastName: normalizeString(row.lastName),
    321 				firstInitial: row.fieldMode == 0 ? normalizeString(row.firstName).charAt(0) : false
    322 			});
    323 		}
    324 		// Add final item creators
    325 		if (itemCreators.length) {
    326 			creatorRowsCache[lastItemID] = itemCreators;
    327 		}
    328 		
    329 		processRows(rows, function (a, b) {
    330 			var aTitle = a.value;
    331 			var bTitle = b.value;
    332 			
    333 			// If we stripped one of the strings completely, we can't compare them
    334 			if(!aTitle || !bTitle) {
    335 				return -1;
    336 			}
    337 			
    338 			if (aTitle !== bTitle) {
    339 				return -1;	//everything is sorted by title, so if this mismatches, everything following will too
    340 			}
    341 			
    342 			// If both items have a DOI and they don't match, it's not a dupe
    343 			if (typeof doiCache[a.itemID] != 'undefined'
    344 					&& typeof doiCache[b.itemID] != 'undefined'
    345 					&& doiCache[a.itemID] != doiCache[b.itemID]) {
    346 				return 0;
    347 			}
    348 			
    349 			// If both items have an ISBN and they don't match, it's not a dupe
    350 			if (typeof isbnCache[a.itemID] != 'undefined'
    351 					&& typeof isbnCache[b.itemID] != 'undefined'
    352 					&& isbnCache[a.itemID] != isbnCache[b.itemID]) {
    353 				return 0;
    354 			}
    355 			
    356 			// If both items have a year and they're off by more than one, it's not a dupe
    357 			if (typeof yearCache[a.itemID] != 'undefined'
    358 					&& typeof yearCache[b.itemID] != 'undefined'
    359 					&& Math.abs(yearCache[a.itemID] - yearCache[b.itemID]) > 1) {
    360 				return 0;
    361 			}
    362 			
    363 			// Check for at least one match on last name + first initial of first name
    364 			var aCreatorRows, bCreatorRows;
    365 			if (typeof creatorRowsCache[a.itemID] != 'undefined') {
    366 				aCreatorRows = creatorRowsCache[a.itemID];
    367 			}
    368 			if (typeof creatorRowsCache[b.itemID] != 'undefined') {
    369 				bCreatorRows = creatorRowsCache[b.itemID];
    370 			}
    371 			
    372 			// Match if no creators
    373 			if (!aCreatorRows && !bCreatorRows) {
    374 				return 1;
    375 			}
    376 			
    377 			if (!aCreatorRows || !bCreatorRows) {
    378 				return 0;
    379 			}
    380 			
    381 			for (let i = 0; i < aCreatorRows.length; i++) {
    382 				let aCreatorRow = aCreatorRows[i];
    383 				let aLastName = aCreatorRow.lastName;
    384 				let aFirstInitial = aCreatorRow.firstInitial;
    385 				
    386 				for (let j = 0; j < bCreatorRows.length; j++) {
    387 					let bCreatorRow = bCreatorRows[j];
    388 					let bLastName = bCreatorRow.lastName;
    389 					let bFirstInitial = bCreatorRow.firstInitial;
    390 					
    391 					if (aLastName === bLastName && aFirstInitial === bFirstInitial) {
    392 						return 1;
    393 					}
    394 				}
    395 			}
    396 			
    397 			return 0;
    398 		}, true);
    399 	}
    400 	
    401 	// Match on exact fields
    402 	/*var fields = [''];
    403 	for (let field of fields) {
    404 		var sql = "SELECT itemID, value FROM items JOIN itemData USING (itemID) "
    405 					+ "JOIN itemDataValues USING (valueID) "
    406 					+ "WHERE libraryID=? AND fieldID=? "
    407 					+ "AND itemID NOT IN (SELECT itemID FROM deletedItems) "
    408 					+ "ORDER BY value";
    409 		var rows = yield Zotero.DB.queryAsync(sql, [this._libraryID, Zotero.ItemFields.getID(field)]);
    410 		processRows(rows);
    411 	}*/
    412 	
    413 	Zotero.debug("Found duplicates in " + (Date.now() - start) + " ms");
    414 });
    415 
    416 
    417 
    418 /**
    419  * Implements the Disjoint Set data structure
    420  *
    421  *  Based on pseudo-code from http://en.wikipedia.org/wiki/Disjoint-set_data_structure
    422  *
    423  * Objects passed should have .id properties that uniquely identify them
    424  */
    425 
    426 Zotero.DisjointSetForest = function () {
    427 	this._objects = {};
    428 }
    429 
    430 Zotero.DisjointSetForest.prototype.find = function (x) {
    431 	var id = x.id;
    432 	
    433 	// If we've seen this object before, use the existing copy,
    434 	// which will have .parent and .rank properties
    435 	if (this._objects[id]) {
    436 		var obj = this._objects[id];
    437 	}
    438 	// Otherwise initialize it as a new set
    439 	else {
    440 		this._makeSet(x);
    441 		this._objects[id] = x;
    442 		var obj = x;
    443 	}
    444 	
    445 	if (obj.parent.id == obj.id) {
    446 		return obj;
    447 	}
    448 	else {
    449 		obj.parent = this.find(obj.parent);
    450 		return obj.parent;
    451 	}
    452 }
    453 
    454 
    455 Zotero.DisjointSetForest.prototype.union = function (x, y) {
    456 	var xRoot = this.find(x);
    457 	var yRoot = this.find(y);
    458 	
    459 	// Already in same set
    460 	if (xRoot.id == yRoot.id) {
    461 		return;
    462 	}
    463 	
    464 	if (xRoot.rank < yRoot.rank) {
    465 		xRoot.parent = yRoot;
    466 	}
    467 	else if (xRoot.rank > yRoot.rank) {
    468 		yRoot.parent = xRoot;
    469 	}
    470 	else {
    471 		yRoot.parent = xRoot;
    472 		xRoot.rank = xRoot.rank + 1;
    473 	}
    474 }
    475 
    476 
    477 Zotero.DisjointSetForest.prototype.sameSet = function (x, y) {
    478     return this.find(x) == this.find(y);
    479 }
    480 
    481 
    482 Zotero.DisjointSetForest.prototype.findAll = function (asIDs) {
    483 	var objects = [];
    484 	for (let i in this._objects) {
    485 		let obj = this._objects[i];
    486 		objects.push(asIDs ? obj.id : obj);
    487 	}
    488 	return objects;
    489 }
    490 
    491 
    492 Zotero.DisjointSetForest.prototype.findAllInSet = function (x, asIDs) {
    493 	var xRoot = this.find(x);
    494 	var objects = [];
    495 	for (let i in this._objects) {
    496 		let obj = this._objects[i];
    497 		if (this.find(obj) == xRoot) {
    498 			objects.push(asIDs ? obj.id : obj);
    499 		}
    500 	}
    501 	return objects;
    502 }
    503 
    504 
    505 Zotero.DisjointSetForest.prototype._makeSet = function (x) {
    506 	x.parent = x;
    507 	x.rank = 0;
    508 }