author     Helmut Grohne <helmut@subdivi.de>   2013-03-02 22:29:04 +0100
committer  Helmut Grohne <helmut@subdivi.de>   2013-03-02 22:29:04 +0100
commit     e296fa30b78d8d52aba74c4a069bc7e061e96dd2 (patch)
tree       c094c1b74601e2d2cb742f8a4ce6e045d60c3f93
parent     c84e9e795a2a31a3e011e13a393b1278b0feb658 (diff)
download   debian-dedup-e296fa30b78d8d52aba74c4a069bc7e061e96dd2.tar.gz
add sharing table
The sharing table is a cache for the /binary web pages; it essentially contains the numbers presented there. The cache is not populated automatically: it has to be reconstructed after every (group of) package imports by rerunning update_sharing.py.
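
As a read-side illustration (not part of this commit), a /binary page could serve its numbers straight from the cache with one indexed lookup. Table and column names below come from schema.sql; the query shape and the example package name are assumptions:

    # Hypothetical read-side sketch: fetch the cached sharing numbers for one
    # package. "coreutils" is only an example name.
    import sqlite3

    def sharing_summary(db, package):
        cur = db.cursor()
        # sharing_package_index (schema.sql) makes this an indexed lookup.
        cur.execute("SELECT package2, func1, func2, files, size FROM sharing "
                    "WHERE package1 = ? ORDER BY size DESC;", (package,))
        return cur.fetchall()

    db = sqlite3.connect("big.sqlite3")
    for package2, func1, func2, files, size in sharing_summary(db, "coreutils"):
        print("%s %s/%s: %d files, %d bytes" % (package2, func1, func2, files, size))
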
-rw-r--r--  schema.sql        |  4
-rwxr-xr-x  update_sharing.py | 62
2 files changed, 66 insertions, 0 deletions
diff --git a/schema.sql b/schema.sql
index 3f3a08e..b5397ee 100644
--- a/schema.sql
+++ b/schema.sql
@@ -3,3 +3,7 @@ CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT,
CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package), FOREIGN KEY (required) REFERENCES package(package));
CREATE INDEX content_package_index ON content (package);
CREATE INDEX content_hash_index ON content (hash);
+
+CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER);
+CREATE INDEX sharing_insert_index ON sharing (package1, package2, func1, func2);
+CREATE INDEX sharing_package_index ON sharing (package1);
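
The composite sharing_insert_index mirrors the four-column WHERE clause of the UPDATE in add_values (in update_sharing.py below), so the upsert does not scan the whole table; sharing_package_index covers per-package reads. A minimal check of that, assuming the big.sqlite3 database used by main():

    # Minimal sketch: verify the four-column lookup is index-backed.
    import sqlite3

    db = sqlite3.connect("big.sqlite3")
    plan = db.execute(
        "EXPLAIN QUERY PLAN UPDATE sharing SET files = files + 1 "
        "WHERE package1 = ? AND package2 = ? AND func1 = ? AND func2 = ?;",
        ("p1", "p2", "f1", "f2")).fetchall()
    print(plan)  # expect a SEARCH ... USING INDEX sharing_insert_index entry
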
diff --git a/update_sharing.py b/update_sharing.py
new file mode 100755
index 0000000..92da945
--- /dev/null
+++ b/update_sharing.py
@@ -0,0 +1,62 @@
+#!/usr/bin/python
+
+import sqlite3
+
+from dedup.utils import fetchiter
+
+def add_values(cursor, insert_key, files, size):
+    # Try to update an existing row first; fall back to INSERT when the
+    # (package1, package2, func1, func2) key is not present yet.
+    cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE package1 = ? AND package2 = ? AND func1 = ? AND func2 = ?;",
+                   (files, size) + insert_key)
+    if cursor.rowcount > 0:
+        return
+    cursor.execute("INSERT INTO sharing (package1, package2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
+                   insert_key + (files, size))
+
+def compute_pkgdict(rows):
+    # Group the rows of one hash as package -> function -> [(size, filename)].
+    pkgdict = dict()
+    for package, filename, size, function in rows:
+        funcdict = pkgdict.setdefault(package, {})
+        funcdict.setdefault(function, []).append((size, filename))
+    return pkgdict
+
+def process_pkgdict(cursor, pkgdict):
+    for package1, funcdict1 in pkgdict.items():
+        for function1, files in funcdict1.items():
+            numfiles = len(files)
+            size = sum(entry[0] for entry in files)
+            for package2, funcdict2 in pkgdict.items():
+                if package1 == package2:
+                    # A package sharing with itself has to keep one copy of
+                    # the file, so discount one file and the smallest size.
+                    pkgnumfiles = numfiles - 1
+                    pkgsize = size - min(entry[0] for entry in files)
+                    if pkgnumfiles == 0:
+                        continue
+                else:
+                    pkgnumfiles = numfiles
+                    pkgsize = size
+                for function2 in funcdict2.keys():
+                    insert_key = (package1, package2, function1, function2)
+                    add_values(cursor, insert_key, pkgnumfiles, pkgsize)
+
+def main():
+    db = sqlite3.connect("big.sqlite3")
+    cur = db.cursor()
+    cur.execute("DELETE FROM sharing;")
+    readcur = db.cursor()
+    # Only hashes occurring more than once can contribute to sharing.
+    readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;")
+    for hashvalue, in fetchiter(readcur):
+        cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
+                    (hashvalue,))
+        rows = cur.fetchall()
+        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+        pkgdict = compute_pkgdict(rows)
+        process_pkgdict(cur, pkgdict)
+    db.commit()
+
+if __name__ == "__main__":
+    main()
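
update_sharing.py leans on fetchiter from dedup.utils, which this commit does not include. Presumably it streams rows from a cursor in batches so the result of the GROUP BY query never has to fit in memory at once; a sketch under that assumption:

    # Sketch of the fetchiter helper imported from dedup.utils (not part of
    # this commit); assumed to yield rows batch by batch.
    def fetchiter(cursor):
        rows = cursor.fetchmany()
        while rows:
            for row in rows:
                yield row
            rows = cursor.fetchmany()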