author     Helmut Grohne <helmut@subdivi.de>   2013-03-02 22:29:04 +0100
committer  Helmut Grohne <helmut@subdivi.de>   2013-03-02 22:29:04 +0100
commit     e296fa30b78d8d52aba74c4a069bc7e061e96dd2 (patch)
tree       c094c1b74601e2d2cb742f8a4ce6e045d60c3f93
parent     c84e9e795a2a31a3e011e13a393b1278b0feb658 (diff)
download   debian-dedup-e296fa30b78d8d52aba74c4a069bc7e061e96dd2.tar.gz
add sharing table
The sharing table is a cache for the /binary web pages; it essentially contains the numbers presented there. The cache is not populated automatically: it has to be reconstructed after every (group of) package imports by rerunning update_sharing.py.
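
As a read-side illustration (not part of this commit), a /binary page could serve its numbers straight from the cache with one indexed lookup. Table and column names below come from schema.sql; the query shape and the example package name are assumptions:

    # Hypothetical read-side sketch: fetch the cached sharing numbers for one
    # package. "coreutils" is only an example name.
    import sqlite3

    def sharing_summary(db, package):
        cur = db.cursor()
        # sharing_package_index (schema.sql) makes this an indexed lookup.
        cur.execute("SELECT package2, func1, func2, files, size FROM sharing "
                    "WHERE package1 = ? ORDER BY size DESC;", (package,))
        return cur.fetchall()

    db = sqlite3.connect("big.sqlite3")
    for package2, func1, func2, files, size in sharing_summary(db, "coreutils"):
        print("%s %s/%s: %d files, %d bytes" % (package2, func1, func2, files, size))
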
-rw-r--r--  schema.sql        |  4
-rwxr-xr-x  update_sharing.py | 62
2 files changed, 66 insertions, 0 deletions
diff --git a/schema.sql b/schema.sql
index 3f3a08e..b5397ee 100644
--- a/schema.sql
+++ b/schema.sql
@@ -3,3 +3,7 @@ CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT,
CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package), FOREIGN KEY (required) REFERENCES package(package));
CREATE INDEX content_package_index ON content (package);
CREATE INDEX content_hash_index ON content (hash);
+
+CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER);
+CREATE INDEX sharing_insert_index ON sharing (package1, package2, func1, func2);
+CREATE INDEX sharing_package_index ON sharing (package1);
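
The composite sharing_insert_index mirrors the four-column WHERE clause of the UPDATE in add_values (in update_sharing.py below), so the upsert does not scan the whole table; sharing_package_index covers per-package reads. A minimal check of that, assuming the big.sqlite3 database used by main():

    # Minimal sketch: verify the four-column lookup is index-backed.
    import sqlite3

    db = sqlite3.connect("big.sqlite3")
    plan = db.execute(
        "EXPLAIN QUERY PLAN UPDATE sharing SET files = files + 1 "
        "WHERE package1 = ? AND package2 = ? AND func1 = ? AND func2 = ?;",
        ("p1", "p2", "f1", "f2")).fetchall()
    print(plan)  # expect a SEARCH ... USING INDEX sharing_insert_index entry
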
diff --git a/update_sharing.py b/update_sharing.py
new file mode 100755
index 0000000..92da945
--- /dev/null
+++ b/update_sharing.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python
+
+import sqlite3
+
+from dedup.utils import fetchiter
+
+def add_values(cursor, insert_key, files, size):
+    # Try to update an existing row first; fall back to INSERT when the
+    # (package1, package2, func1, func2) key is not present yet.
+    cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE package1 = ? AND package2 = ? AND func1 = ? AND func2 = ?;",
+                   (files, size) + insert_key)
+    if cursor.rowcount > 0:
+        return
+    cursor.execute("INSERT INTO sharing (package1, package2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
+                   insert_key + (files, size))
+
+def compute_pkgdict(rows):
+    # Group the rows of one hash as package -> function -> [(size, filename)].
+    pkgdict = dict()
+    for package, filename, size, function in rows:
+        funcdict = pkgdict.setdefault(package, {})
+        funcdict.setdefault(function, []).append((size, filename))
+    return pkgdict
+
+def process_pkgdict(cursor, pkgdict):
+    for package1, funcdict1 in pkgdict.items():
+        for function1, files in funcdict1.items():
+            numfiles = len(files)
+            size = sum(entry[0] for entry in files)
+            for package2, funcdict2 in pkgdict.items():
+                if package1 == package2:
+                    # A package sharing with itself has to keep one copy of
+                    # the file, so discount one file and the smallest size.
+                    pkgnumfiles = numfiles - 1
+                    pkgsize = size - min(entry[0] for entry in files)
+                    if pkgnumfiles == 0:
+                        continue
+                else:
+                    pkgnumfiles = numfiles
+                    pkgsize = size
+                for function2 in funcdict2.keys():
+                    insert_key = (package1, package2, function1, function2)
+                    add_values(cursor, insert_key, pkgnumfiles, pkgsize)
+
+def main():
+    db = sqlite3.connect("big.sqlite3")
+    cur = db.cursor()
+    cur.execute("DELETE FROM sharing;")
+    readcur = db.cursor()
+    # Only hashes occurring more than once can contribute to sharing.
+    readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;")
+    for hashvalue, in fetchiter(readcur):
+        cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
+                    (hashvalue,))
+        rows = cur.fetchall()
+        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+        pkgdict = compute_pkgdict(rows)
+        process_pkgdict(cur, pkgdict)
+    db.commit()
+
+if __name__ == "__main__":
+    main()
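
update_sharing.py leans on fetchiter from dedup.utils, which this commit does not include. Presumably it streams rows from a cursor in batches so the result of the GROUP BY query never has to fit in memory at once; a sketch under that assumption:

    # Sketch of the fetchiter helper imported from dedup.utils (not part of
    # this commit); assumed to yield rows batch by batch.
    def fetchiter(cursor):
        rows = cursor.fetchmany()
        while rows:
            for row in rows:
                yield row
            rows = cursor.fetchmany()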