diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-03-02 22:29:04 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-03-02 22:29:04 +0100 |
commit | e296fa30b78d8d52aba74c4a069bc7e061e96dd2 (patch) | |
tree | c094c1b74601e2d2cb742f8a4ce6e045d60c3f93 /update_sharing.py | |
parent | c84e9e795a2a31a3e011e13a393b1278b0feb658 (diff) | |
download | debian-dedup-e296fa30b78d8d52aba74c4a069bc7e061e96dd2.tar.gz |
add sharing table
The sharing table is a cache for the /binary web pages. It essentially
contains the numbers presented. This caching table is not automatically
populated. It needs to be reconstructed after every (group of) package
imports.
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 56 |
1 files changed, 56 insertions, 0 deletions
diff --git a/update_sharing.py b/update_sharing.py new file mode 100755 index 0000000..92da945 --- /dev/null +++ b/update_sharing.py @@ -0,0 +1,56 @@ +#!/usr/bin/python + +import sqlite3 + +from dedup.utils import fetchiter + +def add_values(cursor, insert_key, files, size): + cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE package1 = ? AND package2 = ? AND func1 = ? AND func2 = ?;", + (files, size) + insert_key) + if cursor.rowcount > 0: + return + cursor.execute("INSERT INTO sharing (package1, package2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);", + insert_key + (files, size)) + +def compute_pkgdict(rows): + pkgdict = dict() + for package, filename, size, function in rows: + funcdict = pkgdict.setdefault(package, {}) + funcdict.setdefault(function, []).append((size, filename)) + return pkgdict + +def process_pkgdict(cursor, pkgdict): + for package1, funcdict1 in pkgdict.items(): + for function1, files in funcdict1.items(): + numfiles = len(files) + size = sum(entry[0] for entry in files) + for package2, funcdict2 in pkgdict.items(): + if package1 == package2: + pkgnumfiles = numfiles - 1 + pkgsize = size - min(entry[0] for entry in files) + if pkgnumfiles == 0: + continue + else: + pkgnumfiles = numfiles + pkgsize = size + for function2 in funcdict2.keys(): + insert_key = (package1, package2, function1, function2) + add_values(cursor, insert_key, pkgnumfiles, pkgsize) + +def main(): + db = sqlite3.connect("big.sqlite3") + cur = db.cursor() + cur.execute("DELETE FROM sharing;") + readcur = db.cursor() + readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;") + for hashvalue, in fetchiter(readcur): + cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;", + (hashvalue,)) + rows = cur.fetchall() + print("processing hash %s with %d entries" % (hashvalue, len(rows))) + pkgdict = compute_pkgdict(rows) + process_pkgdict(cur, pkgdict) + db.commit() + +if __name__ == "__main__": + main() |