author      Helmut Grohne <helmut@subdivi.de>  2014-02-21 21:59:04 +0100
committer   Helmut Grohne <helmut@subdivi.de>  2014-02-21 21:59:04 +0100
commit      7389e4b00f6add611e8d6b318654056097d6d546
tree        60bfd704081c059ea442b27169c1098abab4d0f9
parent      332ac9eafb235443f163c606ced95dcbd615815e
download    debian-dedup-7389e4b00f6add611e8d6b318654056097d6d546.tar.gz
update_sharing: weaken assumptions about db layout
Hash functions are partitioned into equivalence classes. We are
generally only interested in sharing among hash functions within the
same equivalence class, but the algorithm so far computed sharing
across all of them. While the current layout never produces the same
hash for functions in different equivalence classes (their output
lengths differ), that may change in the future.
Also allow hash functions that belong to no equivalence class at all
(eqclass = NULL) as a means to attach additional metadata to content
without computing any sharing for it.
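To illustrate the second point, a minimal sketch of registering a metadata-only
hash function. It assumes an SQLite database whose function table has the name
and eqclass columns referenced in the diff below and an id column that SQLite
assigns automatically; the function name 'metadata_sha512' and the database
path are hypothetical, not part of this change.

import sqlite3

conn = sqlite3.connect("test.sqlite3")  # hypothetical database path
cur = conn.cursor()
# A hash function without an equivalence class: its hashes can still be
# stored for content, but the updated query below skips it via
# "function.eqclass IS NOT NULL", so it never contributes to sharing.
cur.execute("INSERT INTO function (name, eqclass) VALUES (?, NULL);",
            ("metadata_sha512",))
conn.commit()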
-rwxr-xr-x  update_sharing.py  20
1 file changed, 13 insertions(+), 7 deletions(-)
diff --git a/update_sharing.py b/update_sharing.py
index 1ff0fd8..ca6890b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -47,14 +47,20 @@ def main(db):
     readcur = db.cursor()
     readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+        cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;",
                     (hashvalue,))
-        rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-        pkgdict = compute_pkgdict(rows)
-        cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
-                        [(row[1],) for row in rows])
-        process_pkgdict(cur, pkgdict)
+        rowdict = dict()
+        for row in cur.fetchall():
+            rowdict.setdefault(row[0], []).append(row[1:])
+        for eqclass, rows in rowdict.items():
+            if len(rows) < 2:
+                print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+                continue
+            print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+            pkgdict = compute_pkgdict(rows)
+            cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+                            [(row[1],) for row in rows])
+            process_pkgdict(cur, pkgdict)
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
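To show the per-class grouping in isolation: a minimal, self-contained sketch
assuming rows shaped like the new query's result, (eqclass, pid, cid,
filename, size, fid). The helper name group_by_eqclass and the sample data are
hypothetical; the setdefault pattern mirrors the hunk above.

def group_by_eqclass(rows):
    """Partition query rows by their first column, the equivalence class.

    Sharing is only meaningful among hash functions of the same class,
    so classes with fewer than two rows are dropped.
    """
    rowdict = dict()
    for row in rows:
        rowdict.setdefault(row[0], []).append(row[1:])
    return dict((eqclass, members) for eqclass, members in rowdict.items()
                if len(members) >= 2)

# Hypothetical data: two functions in class 1 hash to the same value, and a
# function in class 2 happens to produce the same hash value. Only class 1
# yields sharing; the class 2 row is ignored.
rows = [
    (1, 10, 100, "usr/share/doc/a/copyright", 1234, 1),
    (1, 20, 200, "usr/share/doc/b/copyright", 1234, 2),
    (2, 30, 300, "usr/share/doc/c/copyright", 1234, 3),
]
print(group_by_eqclass(rows))  # only class 1 remains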