diff options
author | Helmut Grohne <helmut@subdivi.de> | 2014-07-22 08:56:42 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2014-07-22 08:56:42 +0200 |
commit | 04597f25729740406775a3dff528c9774c84efd5 (patch) | |
tree | fe905fc94afbdcfad60d5aaf88886a1f10f92a8c /update_sharing.py | |
parent | ba9ae116e0bbb25e2df327ba48c82472ccfa2690 (diff) | |
parent | d48c3c208ee6ba54225b3eb68ce5c9f3c894bfa4 (diff) | |
download | debian-dedup-04597f25729740406775a3dff528c9774c84efd5.tar.gz |
Merge branch master into multiarch
Resolve accumulated conflicts. In particular webapp.py gained a few
non-trivial ones, such as changes in InternalRedirect or usage of
contextlib.closing.
Conflicts:
schema.sql
webapp.py
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 20 |
1 file changed, 13 insertions, 7 deletions
```diff
diff --git a/update_sharing.py b/update_sharing.py
index 1ff0fd8..ca6890b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -47,14 +47,20 @@ def main(db):
     readcur = db.cursor()
     readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+        cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;",
                     (hashvalue,))
-        rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-        pkgdict = compute_pkgdict(rows)
-        cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
-                        [(row[1],) for row in rows])
-        process_pkgdict(cur, pkgdict)
+        rowdict = dict()
+        for row in cur.fetchall():
+            rowdict.setdefault(row[0], []).append(row[1:])
+        for eqclass, rows in rowdict.items():
+            if len(rows) < 2:
+                print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+                continue
+            print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+            pkgdict = compute_pkgdict(rows)
+            cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+                            [(row[1],) for row in rows])
+            process_pkgdict(cur, pkgdict)
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
```