summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Helmut Grohne <helmut@subdivi.de> 2014-02-23 20:12:18 +0100
committer Helmut Grohne <helmut@subdivi.de> 2014-02-23 20:12:18 +0100
commit f29676904602fa9b0e0cf51ab0e7345ba28939db (patch)
tree 33bab7fd2dcc1640ff53fc9eea1c38a59a19e0ab
parent 8d4c5512edbdcdd1063a7e6508f398a5a57981be (diff)
parent 7389e4b00f6add611e8d6b318654056097d6d546 (diff)
download debian-dedup-f29676904602fa9b0e0cf51ab0e7345ba28939db.tar.gz
Merge branch updatesharing-eqclass
-rwxr-xr-x update_sharing.py 20
1 files changed, 13 insertions, 7 deletions
diff --git a/update_sharing.py b/update_sharing.py
index 1ff0fd8..ca6890b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -47,14 +47,20 @@ def main(db):
readcur = db.cursor()
readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
- cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+ cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;",
(hashvalue,))
- rows = cur.fetchall()
- print("processing hash %s with %d entries" % (hashvalue, len(rows)))
- pkgdict = compute_pkgdict(rows)
- cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
- [(row[1],) for row in rows])
- process_pkgdict(cur, pkgdict)
+ rowdict = dict()
+ for row in cur.fetchall():
+ rowdict.setdefault(row[0], []).append(row[1:])
+ for eqclass, rows in rowdict.items():
+ if len(rows) < 2:
+ print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+ continue
+ print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+ pkgdict = compute_pkgdict(rows)
+ cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+ [(row[1],) for row in rows])
+ process_pkgdict(cur, pkgdict)
cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")