| author | Helmut Grohne <helmut@subdivi.de> | 2013-07-03 21:19:13 +0200 |
| --- | --- | --- |
| committer | Helmut Grohne <helmut@subdivi.de> | 2013-07-03 21:19:13 +0200 |
| commit | 56d048320a358b2c417cdb2211b3640394a182e9 (patch) | |
| tree | ef5c52619ff18c4f3391b4eb19301999de4c66e7 /update_sharing.py | |
| parent | f2bd48d342518c11ec7deaeee5b437ac524514de (diff) | |
| download | debian-dedup-56d048320a358b2c417cdb2211b3640394a182e9.tar.gz | |
store hash values as sqlite BLOB
They were previously hex encoded, so this should cut the space consumed
by hashes in half. A first benchmark indicates that the savings in
database size are on the order of 30%.
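Hex encoding doubles the stored size, since every byte of a digest becomes two ASCII characters; binding the raw digest instead lets sqlite store it verbatim as a BLOB. That the whole database shrinks by only about 30% rather than 50% is plausible, since the hashes sit alongside package names, filenames and sizes. A minimal sketch of the size difference (Python 3 syntax, an in-memory database and an invented one-column table; under Python 2, which this script's shebang targets, the bytes would need wrapping in sqlite3.Binary to bind as a BLOB):

```python
import binascii
import hashlib
import sqlite3

db = sqlite3.connect(":memory:")
db.execute("CREATE TABLE hash (hash BLOB)")  # invented stand-in table

digest = hashlib.sha512(b"example file content").digest()   # 64 raw bytes
db.execute("INSERT INTO hash (hash) VALUES (?);", (digest,))  # bytes bind as a BLOB

print(len(digest))                    # 64  -> stored raw as a BLOB
print(len(binascii.b2a_hex(digest)))  # 128 -> what hex encoding would store
```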
Diffstat (limited to 'update_sharing.py')
| -rwxr-xr-x | update_sharing.py | 3 |
| --- | --- | --- |

1 file changed, 2 insertions, 1 deletion
```diff
diff --git a/update_sharing.py b/update_sharing.py
index d2b357b..1ea3d28 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import binascii
 import sqlite3
 
 from dedup.utils import fetchiter
@@ -49,7 +50,7 @@ def main():
         cur.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
                     (hashvalue,))
         rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+        print("processing hash %s with %d entries" % (binascii.b2a_hex(hashvalue), len(rows)))
         pkgdict = compute_pkgdict(rows)
         cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
                         [(row[1],) for row in rows])
```
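The print change follows directly from the storage change: the query now hands back raw bytes, and interpolating those with %s would write unreadable binary to the log, so the value is hex encoded purely for display. A quick illustration with an invented value:

```python
import binascii

hashvalue = b"\xde\xad\xbe\xef"  # a BLOB column now yields raw bytes

# Unencoded, the raw bytes print as binary garbage under Python 2
# (or as a b'\xde\xad...' repr under Python 3):
print("processing hash %s with %d entries" % (hashvalue, 3))

# Hex encoding restores the readable output the log had before:
print("processing hash %s with %d entries" % (binascii.b2a_hex(hashvalue), 3))
```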