summary refs log tree commit diff
diff options
context:
space:
mode:
author Helmut Grohne <helmut@subdivi.de> 2013-07-03 21:19:13 +0200
committer Helmut Grohne <helmut@subdivi.de> 2013-07-03 21:19:13 +0200
commit 56d048320a358b2c417cdb2211b3640394a182e9 (patch)
tree ef5c52619ff18c4f3391b4eb19301999de4c66e7
parent f2bd48d342518c11ec7deaeee5b437ac524514de (diff)
download debian-dedup-56d048320a358b2c417cdb2211b3640394a182e9.tar.gz
store hash values as sqlite BLOB
They were previously hex-encoded, so this should cut the space consumed by hashes in half. A first benchmark indicates that the savings in database size are on the order of 30%.
-rwxr-xr-x readyaml.py 3
-rw-r--r-- schema.sql 2
-rwxr-xr-x update_sharing.py 3
-rwxr-xr-x webapp.py 12
4 files changed, 14 insertions, 6 deletions
diff --git a/readyaml.py b/readyaml.py
index e2f3bb3..1a7206d 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -2,6 +2,7 @@
"""This tool reads a yaml file as generated by importpkg.py on stdin and
updates the database with the contents."""
+import binascii
import sqlite3
import sys
@@ -39,7 +40,7 @@ def readyaml(db, stream):
(package, entry["name"], entry["size"]))
cid = cur.lastrowid
cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
- ((cid, func, hexhash)
+ ((cid, func, buffer(binascii.a2b_hex(hexhash)))
for func, hexhash in entry["hashes"].items()))
raise ValueError("missing commit block")
diff --git a/schema.sql b/schema.sql
index e942c7b..30ed420 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,6 +1,6 @@
CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT);
CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
-CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
+CREATE TABLE hash (cid INTEGER, function TEXT, hash BLOB, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
CREATE INDEX content_package_index ON content (package);
CREATE INDEX hash_cid_index ON hash (cid);
diff --git a/update_sharing.py b/update_sharing.py
index d2b357b..1ea3d28 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,5 +1,6 @@
#!/usr/bin/python
+import binascii
import sqlite3
from dedup.utils import fetchiter
@@ -49,7 +50,7 @@ def main():
cur.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
(hashvalue,))
rows = cur.fetchall()
- print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+ print("processing hash %s with %d entries" % (binascii.b2a_hex(hashvalue), len(rows)))
pkgdict = compute_pkgdict(rows)
cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
[(row[1],) for row in rows])
diff --git a/webapp.py b/webapp.py
index 86d14f0..3820cf9 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,5 +1,6 @@
#!/usr/bin/python
+import binascii
import datetime
import os.path
import sqlite3
@@ -210,7 +211,7 @@ def generate_shared(rows):
funcdict = dict()
entry = dict(filename1=filename1, filename2=filename2, size1=size1,
size2=size2, functions=funcdict)
- funcdict[funccomb] = hashvalue
+ funcdict[funccomb] = binascii.b2a_hex(hashvalue)
if entry:
yield entry
@@ -317,6 +318,7 @@ class Application(object):
files.clear()
cursize = size
+ hashvalue = binascii.b2a_hex(hashvalue)
if hashvalue in files:
files[hashvalue]["filenames"].add(filename)
continue
@@ -329,7 +331,7 @@ class Application(object):
(cid, package2))
for func1, hashvalue, func2, filename in fetchiter(cur2):
entry["matches"].setdefault(filename, {})[func1, func2] = \
- hashvalue
+ binascii.b2a_hex(hashvalue)
cur2.close()
cur.close()
@@ -351,9 +353,13 @@ class Application(object):
return html_response(detail_template.stream(params))
def show_hash(self, function, hashvalue):
+ try:
+ bhash = buffer(binascii.a2b_hex(hashvalue))
+ except TypeError:
+ raise NotFound()
cur = self.db.cursor()
cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
- (hashvalue,))
+ (bhash,))
entries = [dict(package=package, filename=filename, size=size,
function=otherfunc)
for package, filename, size, otherfunc in fetchiter(cur)