diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-07-23 23:23:41 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-07-23 23:23:41 +0200 |
commit | eaba84e444c77495a5654b600c599646b8aa1aed (patch) | |
tree | ff6bc8bb15de0c3669e2a6a6ad159b39dd638594 /update_sharing.py | |
parent | 6206dea43941560a29c9a1105ae3055740ab80aa (diff) | |
download | debian-dedup-eaba84e444c77495a5654b600c599646b8aa1aed.tar.gz |
schema: identify hash values by an integer hashid
This one is a bit more complex than the other transformations, because
the new hashvalue table has to be cleaned with a trigger. During a test
import the -wal file exploded. The resulting db is similar in size to
the original.
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 10 |
1 files changed, 5 insertions, 5 deletions
diff --git a/update_sharing.py b/update_sharing.py index 55e8096..bca0213 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -44,12 +44,12 @@ def main(): cur.execute("DELETE FROM sharing;") cur.execute("DELETE FROM duplicate;") readcur = db.cursor() - readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") - for hashvalue, in fetchiter(readcur): - cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;", - (hashvalue,)) + readcur.execute("SELECT hid FROM hash GROUP BY hid HAVING count(*) > 1;") + for hid, in fetchiter(readcur): + cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hid = ?;", + (hid,)) rows = cur.fetchall() - print("processing hash %s with %d entries" % (hashvalue, len(rows))) + print("processing hash %d with %d entries" % (hid, len(rows))) pkgdict = compute_pkgdict(rows) cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);", [(row[1],) for row in rows]) |