store hash values as sqlite BLOB

Hash values were previously stored hex-encoded in a TEXT column; storing the raw bytes as a BLOB should cut the space consumed by hashes in half. A first benchmark indicates that the savings in overall database size are on the order of 30%.
author: Helmut Grohne <helmut@subdivi.de> 2013-07-03 21:19:13 +0200
committer: Helmut Grohne <helmut@subdivi.de> 2013-07-03 21:19:13 +0200
commit: 56d048320a358b2c417cdb2211b3640394a182e9 (patch)
tree: ef5c52619ff18c4f3391b4eb19301999de4c66e7
parent: f2bd48d342518c11ec7deaeee5b437ac524514de (diff)
download: debian-dedup-56d048320a358b2c417cdb2211b3640394a182e9.tar.gz
4 files changed, 14 insertions, 6 deletions
diff --git a/readyaml.py b/readyaml.py
index e2f3bb3..1a7206d 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -2,6 +2,7 @@
 """This tool reads a yaml file as generated by importpkg.py on stdin and
 updates the database with the contents."""
 
+import binascii
 import sqlite3
 import sys
 
@@ -39,7 +40,7 @@ def readyaml(db, stream):
                     (package, entry["name"], entry["size"]))
         cid = cur.lastrowid
         cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
-                        ((cid, func, hexhash)
+                        ((cid, func, buffer(binascii.a2b_hex(hexhash)))
                          for func, hexhash in entry["hashes"].items()))
     raise ValueError("missing commit block")
 
diff --git a/schema.sql b/schema.sql
index e942c7b..30ed420 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,6 +1,6 @@
 CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT);
 CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
-CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
+CREATE TABLE hash (cid INTEGER, function TEXT, hash BLOB, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
 CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
 CREATE INDEX content_package_index ON content (package);
 CREATE INDEX hash_cid_index ON hash (cid);
diff --git a/update_sharing.py b/update_sharing.py
index d2b357b..1ea3d28 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import binascii
 import sqlite3
 
 from dedup.utils import fetchiter
@@ -49,7 +50,7 @@ def main():
         cur.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
                     (hashvalue,))
         rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+        print("processing hash %s with %d entries" % (binascii.b2a_hex(hashvalue), len(rows)))
         pkgdict = compute_pkgdict(rows)
         cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
                         [(row[1],) for row in rows])
diff --git a/webapp.py b/webapp.py
index 86d14f0..3820cf9 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import binascii
 import datetime
 import os.path
 import sqlite3
@@ -210,7 +211,7 @@ def generate_shared(rows):
             funcdict = dict()
             entry = dict(filename1=filename1, filename2=filename2, size1=size1,
                          size2=size2, functions=funcdict)
-        funcdict[funccomb] = hashvalue
+        funcdict[funccomb] = binascii.b2a_hex(hashvalue)
     if entry:
         yield entry
 
@@ -317,6 +318,7 @@ class Application(object):
                 files.clear()
                 cursize = size
 
+            hashvalue = binascii.b2a_hex(hashvalue)
             if hashvalue in files:
                 files[hashvalue]["filenames"].add(filename)
                 continue
@@ -329,7 +331,7 @@ class Application(object):
                          (cid, package2))
             for func1, hashvalue, func2, filename in fetchiter(cur2):
                 entry["matches"].setdefault(filename, {})[func1, func2] = \
-                        hashvalue
+                        binascii.b2a_hex(hashvalue)
             cur2.close()
         cur.close()
 
@@ -351,9 +353,13 @@ class Application(object):
         return html_response(detail_template.stream(params))
 
     def show_hash(self, function, hashvalue):
+        try:
+            bhash = buffer(binascii.a2b_hex(hashvalue))
+        except TypeError:
+            raise NotFound()
         cur = self.db.cursor()
         cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
-                    (hashvalue,))
+                    (bhash,))
         entries = [dict(package=package, filename=filename, size=size,
                         function=otherfunc)
                    for package, filename, size, otherfunc in fetchiter(cur)
author	Helmut Grohne <helmut@subdivi.de>	2013-07-03 21:19:13 +0200
committer	Helmut Grohne <helmut@subdivi.de>	2013-07-03 21:19:13 +0200
commit	56d048320a358b2c417cdb2211b3640394a182e9 (patch)
tree	ef5c52619ff18c4f3391b4eb19301999de4c66e7
parent	f2bd48d342518c11ec7deaeee5b437ac524514de (diff)
download	debian-dedup-56d048320a358b2c417cdb2211b3640394a182e9.tar.gz