diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-07-26 15:04:02 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-07-26 15:04:02 +0200 |
commit | dc378a18d50142baceaef4c2a416cb5a40f84861 (patch) | |
tree | 975967733a6f7f726618843df11acf766537f9e0 /update_sharing.py | |
parent | 9b653583711c59d96c45af43ff8ee9534500adb6 (diff) | |
parent | 32f406706c0a2a21b11656e5c56ff203e0ee3799 (diff) | |
download | debian-dedup-dc378a18d50142baceaef4c2a416cb5a40f84861.tar.gz |
Merge branch functionid
Actual savings on the full data set are around 7%.
Conflicts:
README
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 16 |
1 file changed, 8 insertions, 8 deletions
```diff
diff --git a/update_sharing.py b/update_sharing.py
index 62a3ab5..4669759 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -5,23 +5,23 @@ import sqlite3
 from dedup.utils import fetchiter
 
 def add_values(cursor, insert_key, files, size):
-    cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND func1 = ? AND func2 = ?;",
+    cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND fid1 = ? AND fid2 = ?;",
                    (files, size) + insert_key)
     if cursor.rowcount > 0:
         return
-    cursor.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
+    cursor.execute("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
                    insert_key + (files, size))
 
 def compute_pkgdict(rows):
     pkgdict = dict()
-    for pid, _, filename, size, function in rows:
+    for pid, _, filename, size, fid in rows:
         funcdict = pkgdict.setdefault(pid, {})
-        funcdict.setdefault(function, []).append((size, filename))
+        funcdict.setdefault(fid, []).append((size, filename))
     return pkgdict
 
 def process_pkgdict(cursor, pkgdict):
     for pid1, funcdict1 in pkgdict.items():
-        for function1, files in funcdict1.items():
+        for fid1, files in funcdict1.items():
             numfiles = len(files)
             size = sum(entry[0] for entry in files)
             for pid2, funcdict2 in pkgdict.items():
@@ -33,8 +33,8 @@ def process_pkgdict(cursor, pkgdict):
                 else:
                     pkgnumfiles = numfiles
                     pkgsize = size
-                for function2 in funcdict2.keys():
-                    insert_key = (pid1, pid2, function1, function2)
+                for fid2 in funcdict2.keys():
+                    insert_key = (pid1, pid2, fid1, fid2)
                     add_values(cursor, insert_key, pkgnumfiles, pkgsize)
 
 def main():
@@ -47,7 +47,7 @@ def main():
     readcur = db.cursor()
     readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
                     (hashvalue,))
         rows = cur.fetchall()
         print("processing hash %s with %d entries" % (hashvalue, len(rows)))
```