split hashes out of the content table into a separate hash table

In the old content table, the (package, filename, size) triple was repeated in one row per hash function applied to a file. Now the schema represents that each file has precisely one size, but multiple hashes.
author: Helmut Grohne <helmut@subdivi.de> 2013-03-09 18:43:47 +0100
committer: Helmut Grohne <helmut@subdivi.de> 2013-03-09 18:43:47 +0100
commit: 5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd (patch)
tree: 08f4b4cd1aae470eb6bae21bd0a4859ecd91aee9
parent: 423ceee0d0befc8755a9ae915d15e8d415d98159 (diff)
download: debian-dedup-5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd.tar.gz
4 files changed, 18 insertions, 11 deletions
diff --git a/importpkg.py b/importpkg.py
index aae9a7f..5d6a58c 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -102,10 +102,12 @@ def get_hashes(tar):
             continue
         hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
         hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
         for hashobj in hasher.hashes:
             hashvalue = hashobj.hexdigest()
             if hashvalue:
-                yield (elem.name, elem.size, hashobj.name, hashvalue)
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)
 
 def process_package(db, filelike):
     cur = db.cursor()
@@ -169,14 +171,17 @@ def process_package(db, filelike):
             continue
         if state != "control_file":
             raise ValueError("missing control file")
-        for name, size, function, hexhash in get_hashes(tf):
+        for name, size, hashes in get_hashes(tf):
             try:
                 name = name.decode("utf8")
             except UnicodeDecodeError:
                 print("warning: skipping filename with encoding error")
                 continue # skip files with non-utf8 encoding for now
-            cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
-                        (package, name, size, function, hexhash))
+            cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);",
+                        (package, name, size))
+            cid = cur.lastrowid
+            cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
+                            ((cid, func, hexhash) for func, hexhash in hashes.items()))
         db.commit()
         return
     raise ValueError("data.tar not found")
diff --git a/schema.sql b/schema.sql
index c329465..a67c807 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,8 +1,10 @@
 CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT);
-CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
+CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
+CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
 CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
 CREATE INDEX content_package_index ON content (package);
-CREATE INDEX content_hash_index ON content (hash);
+CREATE INDEX hash_cid_index ON hash (cid);
+CREATE INDEX hash_hash_index ON hash (hash);
 
 CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (package1) REFERENCES package(package) ON DELETE CASCADE, FOREIGN KEY (package2) REFERENCES package(package) ON DELETE CASCADE);
 CREATE INDEX sharing_insert_index ON sharing (package1, package2, func1, func2);
diff --git a/update_sharing.py b/update_sharing.py
index 2ed532b..b45e40b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -43,9 +43,9 @@ def main():
     cur.execute("PRAGMA foreign_keys = ON;")
     cur.execute("DELETE FROM sharing;")
     readcur = db.cursor()
-    readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;")
+    readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
+        cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
                     (hashvalue,))
         rows = cur.fetchall()
         print("processing hash %s with %d entries" % (hashvalue, len(rows)))
diff --git a/webapp.py b/webapp.py
index f80b3da..1da987b 100755
--- a/webapp.py
+++ b/webapp.py
@@ -279,13 +279,13 @@ class Application(object):
         if package1 == package2:
             details1 = details2 = self.get_details(package1)
 
-            cur.execute("SELECT a.filename, a.size, a.function, b.filename, b.size, b.function, a.hash FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND b.package = ? AND a.filename != b.filename ORDER BY a.size DESC, a.filename, b.filename;",
+            cur.execute("SELECT a.filename, a.size, ha.function, b.filename, b.size, hb.function, ha.hash FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE a.package = ? AND b.package = ? AND a.filename != b.filename ORDER BY a.size DESC, a.filename, b.filename;",
                         (package1, package1))
         else:
             details1 = self.get_details(package1)
             details2 = self.get_details(package2)
 
-            cur.execute("SELECT a.filename, a.size, a.function, b.filename, b.size, b.function, a.hash FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND b.package = ? ORDER BY a.size DESC, a.filename, b.filename;",
+            cur.execute("SELECT a.filename, a.size, ha.function, b.filename, b.size, hb.function, ha.hash FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE a.package = ? AND b.package = ? ORDER BY a.size DESC, a.filename, b.filename;",
                         (package1, package2))
         shared = generate_shared(fetchiter(cur))
         # The cursor will be in use until the template is fully rendered.
@@ -297,7 +297,7 @@ class Application(object):
 
     def show_hash(self, function, hashvalue):
         cur = self.db.cursor()
-        cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
+        cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
                     (hashvalue,))
         entries = [dict(package=package, filename=filename, size=size,
                         function=otherfunc)
author	Helmut Grohne <helmut@subdivi.de>	2013-03-09 18:43:47 +0100
committer	Helmut Grohne <helmut@subdivi.de>	2013-03-09 18:43:47 +0100
commit	5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd (patch)
tree	08f4b4cd1aae470eb6bae21bd0a4859ecd91aee9
parent	423ceee0d0befc8755a9ae915d15e8d415d98159 (diff)
download	debian-dedup-5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd.tar.gz