diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-07-03 21:19:13 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-07-03 21:19:13 +0200 |
commit | 56d048320a358b2c417cdb2211b3640394a182e9 (patch) | |
tree | ef5c52619ff18c4f3391b4eb19301999de4c66e7 | |
parent | f2bd48d342518c11ec7deaeee5b437ac524514de (diff) | |
download | debian-dedup-56d048320a358b2c417cdb2211b3640394a182e9.tar.gz |
store hash values as sqlite BLOB
Hash values were previously stored hex encoded, so this should cut the space consumed
by hashes in half. A first benchmark indicates that the savings in
database size are on the order of 30%.
-rwxr-xr-x | readyaml.py | 3 | ||||
-rw-r--r-- | schema.sql | 2 | ||||
-rwxr-xr-x | update_sharing.py | 3 | ||||
-rwxr-xr-x | webapp.py | 12 |
4 files changed, 14 insertions, 6 deletions
diff --git a/readyaml.py b/readyaml.py index e2f3bb3..1a7206d 100755 --- a/readyaml.py +++ b/readyaml.py @@ -2,6 +2,7 @@ """This tool reads a yaml file as generated by importpkg.py on stdin and updates the database with the contents.""" +import binascii import sqlite3 import sys @@ -39,7 +40,7 @@ def readyaml(db, stream): (package, entry["name"], entry["size"])) cid = cur.lastrowid cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);", - ((cid, func, hexhash) + ((cid, func, buffer(binascii.a2b_hex(hexhash))) for func, hexhash in entry["hashes"].items())) raise ValueError("missing commit block") @@ -1,6 +1,6 @@ CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE); -CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); +CREATE TABLE hash (cid INTEGER, function TEXT, hash BLOB, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE); CREATE INDEX content_package_index ON content (package); CREATE INDEX hash_cid_index ON hash (cid); diff --git a/update_sharing.py b/update_sharing.py index d2b357b..1ea3d28 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -1,5 +1,6 @@ #!/usr/bin/python +import binascii import sqlite3 from dedup.utils import fetchiter @@ -49,7 +50,7 @@ def main(): cur.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;", (hashvalue,)) rows = cur.fetchall() - print("processing hash %s with %d entries" % (hashvalue, len(rows))) + print("processing hash %s with %d entries" % (binascii.b2a_hex(hashvalue), len(rows))) pkgdict = 
compute_pkgdict(rows) cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);", [(row[1],) for row in rows]) @@ -1,5 +1,6 @@ #!/usr/bin/python +import binascii import datetime import os.path import sqlite3 @@ -210,7 +211,7 @@ def generate_shared(rows): funcdict = dict() entry = dict(filename1=filename1, filename2=filename2, size1=size1, size2=size2, functions=funcdict) - funcdict[funccomb] = hashvalue + funcdict[funccomb] = binascii.b2a_hex(hashvalue) if entry: yield entry @@ -317,6 +318,7 @@ class Application(object): files.clear() cursize = size + hashvalue = binascii.b2a_hex(hashvalue) if hashvalue in files: files[hashvalue]["filenames"].add(filename) continue @@ -329,7 +331,7 @@ class Application(object): (cid, package2)) for func1, hashvalue, func2, filename in fetchiter(cur2): entry["matches"].setdefault(filename, {})[func1, func2] = \ - hashvalue + binascii.b2a_hex(hashvalue) cur2.close() cur.close() @@ -351,9 +353,13 @@ class Application(object): return html_response(detail_template.stream(params)) def show_hash(self, function, hashvalue): + try: + bhash = buffer(binascii.a2b_hex(hashvalue)) + except TypeError: + raise NotFound() cur = self.db.cursor() cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;", - (hashvalue,)) + (bhash,)) entries = [dict(package=package, filename=filename, size=size, function=otherfunc) for package, filename, size, otherfunc in fetchiter(cur) |