author      Helmut Grohne <helmut@subdivi.de>  2014-02-21 21:59:04 +0100
committer   Helmut Grohne <helmut@subdivi.de>  2014-02-21 21:59:04 +0100
commit      7389e4b00f6add611e8d6b318654056097d6d546
tree        60bfd704081c059ea442b27169c1098abab4d0f9
parent      332ac9eafb235443f163c606ced95dcbd615815e
download    debian-dedup-7389e4b00f6add611e8d6b318654056097d6d546.tar.gz
update_sharing: weaken assumptions about db layout
Hash functions are partitioned into equivalence classes. We are
generally only interested in sharing among hash functions within the
same equivalence class, but the algorithm so far computed sharing
across all of them. While the current layout never produces the same
hash for functions in different equivalence classes (their output
lengths differ), that may change in the future.
Also allow hash functions that belong to no equivalence class at all
(eqclass = NULL) as a means to attach additional metadata to content
without computing any sharing for it.
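To illustrate the second point, a minimal sketch of registering a metadata-only
hash function. It assumes an SQLite database whose function table has the name
and eqclass columns referenced in the diff below and an id column that SQLite
assigns automatically; the function name 'metadata_sha512' and the database
path are hypothetical, not part of this change.

import sqlite3

conn = sqlite3.connect("test.sqlite3")  # hypothetical database path
cur = conn.cursor()
# A hash function without an equivalence class: its hashes can still be
# stored for content, but the updated query below skips it via
# "function.eqclass IS NOT NULL", so it never contributes to sharing.
cur.execute("INSERT INTO function (name, eqclass) VALUES (?, NULL);",
            ("metadata_sha512",))
conn.commit()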
-rwxr-xr-x  update_sharing.py  20
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/update_sharing.py b/update_sharing.py
index 1ff0fd8..ca6890b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -47,14 +47,20 @@ def main(db):
     readcur = db.cursor()
     readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+        cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;",
                     (hashvalue,))
-        rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-        pkgdict = compute_pkgdict(rows)
-        cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
-                        [(row[1],) for row in rows])
-        process_pkgdict(cur, pkgdict)
+        rowdict = dict()
+        for row in cur.fetchall():
+            rowdict.setdefault(row[0], []).append(row[1:])
+        for eqclass, rows in rowdict.items():
+            if len(rows) < 2:
+                print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+                continue
+            print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+            pkgdict = compute_pkgdict(rows)
+            cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+                            [(row[1],) for row in rows])
+            process_pkgdict(cur, pkgdict)
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
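To show the per-class grouping in isolation: a minimal, self-contained sketch
assuming rows shaped like the new query's result, (eqclass, pid, cid,
filename, size, fid). The helper name group_by_eqclass and the sample data are
hypothetical; the setdefault pattern mirrors the hunk above.

def group_by_eqclass(rows):
    """Partition query rows by their first column, the equivalence class.

    Sharing is only meaningful among hash functions of the same class,
    so classes with fewer than two rows are dropped.
    """
    rowdict = dict()
    for row in rows:
        rowdict.setdefault(row[0], []).append(row[1:])
    return dict((eqclass, members) for eqclass, members in rowdict.items()
                if len(members) >= 2)

# Hypothetical data: two functions in class 1 hash to the same value, and a
# function in class 2 happens to produce the same hash value. Only class 1
# yields sharing; the class 2 row is ignored.
rows = [
    (1, 10, 100, "usr/share/doc/a/copyright", 1234, 1),
    (1, 20, 200, "usr/share/doc/b/copyright", 1234, 2),
    (2, 30, 300, "usr/share/doc/c/copyright", 1234, 3),
]
print(group_by_eqclass(rows))  # only class 1 remains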