Merge branch functionid

Actual savings on the full data set are around 7%.

Conflicts: README
author: Helmut Grohne <helmut@subdivi.de> 2013-07-26 15:04:02 +0200
committer: Helmut Grohne <helmut@subdivi.de> 2013-07-26 15:04:02 +0200
commit: dc378a18d50142baceaef4c2a416cb5a40f84861 (patch)
tree: 975967733a6f7f726618843df11acf766537f9e0 /update_sharing.py
parent: 9b653583711c59d96c45af43ff8ee9534500adb6 (diff)
parent: 32f406706c0a2a21b11656e5c56ff203e0ee3799 (diff)
download: debian-dedup-dc378a18d50142baceaef4c2a416cb5a40f84861.tar.gz
1 file changed, 8 insertions, 8 deletions
diff --git a/update_sharing.py b/update_sharing.py
index 62a3ab5..4669759 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -5,23 +5,23 @@ import sqlite3
 from dedup.utils import fetchiter
 
 def add_values(cursor, insert_key, files, size):
-    cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND func1 = ? AND func2 = ?;",
+    cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND fid1 = ? AND fid2 = ?;",
                    (files, size) + insert_key)
     if cursor.rowcount > 0:
         return
-    cursor.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
+    cursor.execute("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
                    insert_key + (files, size))
 
 def compute_pkgdict(rows):
     pkgdict = dict()
-    for pid, _, filename, size, function in rows:
+    for pid, _, filename, size, fid in rows:
         funcdict = pkgdict.setdefault(pid, {})
-        funcdict.setdefault(function, []).append((size, filename))
+        funcdict.setdefault(fid, []).append((size, filename))
     return pkgdict
 
 def process_pkgdict(cursor, pkgdict):
     for pid1, funcdict1 in pkgdict.items():
-        for function1, files in funcdict1.items():
+        for fid1, files in funcdict1.items():
             numfiles = len(files)
             size = sum(entry[0] for entry in files)
             for pid2, funcdict2 in pkgdict.items():
@@ -33,8 +33,8 @@ def process_pkgdict(cursor, pkgdict):
                 else:
                     pkgnumfiles = numfiles
                     pkgsize = size
-                for function2 in funcdict2.keys():
-                    insert_key = (pid1, pid2, function1, function2)
+                for fid2 in funcdict2.keys():
+                    insert_key = (pid1, pid2, fid1, fid2)
                     add_values(cursor, insert_key, pkgnumfiles, pkgsize)
 
 def main():
@@ -47,7 +47,7 @@ def main():
     readcur = db.cursor()
     readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
                     (hashvalue,))
         rows = cur.fetchall()
         print("processing hash %s with %d entries" % (hashvalue, len(rows)))
author	Helmut Grohne <helmut@subdivi.de>	2013-07-26 15:04:02 +0200
committer	Helmut Grohne <helmut@subdivi.de>	2013-07-26 15:04:02 +0200
commit	dc378a18d50142baceaef4c2a416cb5a40f84861 (patch)
tree	975967733a6f7f726618843df11acf766537f9e0 /update_sharing.py
parent	9b653583711c59d96c45af43ff8ee9534500adb6 (diff)
parent	32f406706c0a2a21b11656e5c56ff203e0ee3799 (diff)
download	debian-dedup-dc378a18d50142baceaef4c2a416cb5a40f84861.tar.gz