path: root/update_sharing.py
author     Helmut Grohne <helmut@subdivi.de>  2013-03-02 22:29:04 +0100
committer  Helmut Grohne <helmut@subdivi.de>  2013-03-02 22:29:04 +0100
commit     e296fa30b78d8d52aba74c4a069bc7e061e96dd2 (patch)
tree       c094c1b74601e2d2cb742f8a4ce6e045d60c3f93 /update_sharing.py
parent     c84e9e795a2a31a3e011e13a393b1278b0feb658 (diff)
download   debian-dedup-e296fa30b78d8d52aba74c4a069bc7e061e96dd2.tar.gz
add sharing table
The sharing table is a cache for the /binary web pages; it essentially contains the numbers presented there. The table is not populated automatically and has to be rebuilt after every package import (or group of imports).
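
The script added below expects a sharing table to already exist in big.sqlite3. Its schema is not part of this commit; the following is a minimal sketch inferred from the INSERT statement in update_sharing.py, so the column types (and the lack of indexes) are assumptions rather than the repository's actual schema:

    import sqlite3

    # Sketch only: columns inferred from the INSERT in update_sharing.py,
    # types are guesses; the real schema is defined elsewhere in the repo.
    db = sqlite3.connect("big.sqlite3")
    db.execute("""
        CREATE TABLE IF NOT EXISTS sharing (
            package1 TEXT,   -- package whose duplicated files are counted
            package2 TEXT,   -- package it shares those files with
            func1 TEXT,      -- hash function used on package1's side
            func2 TEXT,      -- hash function used on package2's side
            files INTEGER,   -- number of shared files
            size INTEGER     -- total size of the shared files in bytes
        )""")
    db.commit()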
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x  update_sharing.py  56
1 files changed, 56 insertions, 0 deletions
diff --git a/update_sharing.py b/update_sharing.py
new file mode 100755
index 0000000..92da945
--- /dev/null
+++ b/update_sharing.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python
+
+import sqlite3
+
+from dedup.utils import fetchiter
+
+def add_values(cursor, insert_key, files, size):
+ cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE package1 = ? AND package2 = ? AND func1 = ? AND func2 = ?;",
+ (files, size) + insert_key)
+ if cursor.rowcount > 0:
+ return
+ cursor.execute("INSERT INTO sharing (package1, package2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
+ insert_key + (files, size))
+
+def compute_pkgdict(rows):
+ pkgdict = dict()
+ for package, filename, size, function in rows:
+ funcdict = pkgdict.setdefault(package, {})
+ funcdict.setdefault(function, []).append((size, filename))
+ return pkgdict
+
+def process_pkgdict(cursor, pkgdict):
+ for package1, funcdict1 in pkgdict.items():
+ for function1, files in funcdict1.items():
+ numfiles = len(files)
+ size = sum(entry[0] for entry in files)
+ for package2, funcdict2 in pkgdict.items():
+ if package1 == package2:
+ pkgnumfiles = numfiles - 1
+ pkgsize = size - min(entry[0] for entry in files)
+ if pkgnumfiles == 0:
+ continue
+ else:
+ pkgnumfiles = numfiles
+ pkgsize = size
+ for function2 in funcdict2.keys():
+ insert_key = (package1, package2, function1, function2)
+ add_values(cursor, insert_key, pkgnumfiles, pkgsize)
+
+def main():
+ db = sqlite3.connect("big.sqlite3")
+ cur = db.cursor()
+ cur.execute("DELETE FROM sharing;")
+ readcur = db.cursor()
+ readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;")
+ for hashvalue, in fetchiter(readcur):
+ cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
+ (hashvalue,))
+ rows = cur.fetchall()
+ print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+ pkgdict = compute_pkgdict(rows)
+ process_pkgdict(cur, pkgdict)
+ db.commit()
+
+if __name__ == "__main__":
+ main()
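
To illustrate what the per-hash processing does, here is a hedged, self-contained usage sketch. The rows, package names and function name are made up, and it assumes update_sharing.py is importable as a module from the repository root:

    from update_sharing import compute_pkgdict

    # Two packages shipping an identical file (same hash, 1200 bytes each).
    rows = [
        ("pkg-a", "usr/share/doc/pkg-a/changelog.gz", 1200, "sha512"),
        ("pkg-b", "usr/share/doc/pkg-b/changelog.gz", 1200, "sha512"),
    ]

    pkgdict = compute_pkgdict(rows)
    # pkgdict == {"pkg-a": {"sha512": [(1200, "usr/share/doc/pkg-a/changelog.gz")]},
    #             "pkg-b": {"sha512": [(1200, "usr/share/doc/pkg-b/changelog.gz")]}}

    # process_pkgdict(cursor, pkgdict) would then call add_values twice,
    # crediting 1 file / 1200 bytes to (pkg-a, pkg-b, sha512, sha512) and the
    # same to (pkg-b, pkg-a, sha512, sha512). The self-pairing (pkg-a, pkg-a)
    # is skipped here because each package carries only one file with this hash.

For the self-pairing case, process_pkgdict subtracts one copy (numfiles - 1 and the smallest file's size), so a file is never counted as shared with itself within a single package.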