diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-03-09 18:43:47 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-03-09 18:43:47 +0100 |
commit | 5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd (patch) | |
tree | 08f4b4cd1aae470eb6bae21bd0a4859ecd91aee9 /update_sharing.py | |
parent | 423ceee0d0befc8755a9ae915d15e8d415d98159 (diff) | |
download | debian-dedup-5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd.tar.gz |
split hashes out of the content table into a separate hash table
In the old content table, the (package, filename, size) triple was repeated
once for each hash function. Now the schema represents that each file has
precisely one size, but multiple hashes.
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 4 |
1 files changed, 2 insertions, 2 deletions
diff --git a/update_sharing.py b/update_sharing.py index 2ed532b..b45e40b 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -43,9 +43,9 @@ def main(): cur.execute("PRAGMA foreign_keys = ON;") cur.execute("DELETE FROM sharing;") readcur = db.cursor() - readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;") + readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") for hashvalue, in fetchiter(readcur): - cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;", + cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;", (hashvalue,)) rows = cur.fetchall() print("processing hash %s with %d entries" % (hashvalue, len(rows))) |