diff options
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 23 |
1 files changed, 11 insertions, 12 deletions
diff --git a/update_sharing.py b/update_sharing.py index caca445..664b627 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -5,30 +5,29 @@ import sqlalchemy from dedup.utils import fetchiter, enable_sqlite_foreign_keys def add_values(conn, insert_key, files, size): - params = dict(files=files, size=size, package1=insert_key[0], - package2=insert_key[1], func1=insert_key[2], - func2=insert_key[3]) - rows = conn.execute("UPDATE sharing SET files = files + :files, size = size + :size WHERE package1 = :package1 AND package2 = :package2 AND func1 = :func1 AND func2 = :func2;", + params = dict(files=files, size=size, pid1=insert_key[0], + pid2=insert_key[1], func1=insert_key[2], func2=insert_key[3]) + rows = conn.execute("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND func1 = :func1 AND func2 = :func2;", **params) if rows.rowcount > 0: return - conn.execute("INSERT INTO sharing (package1, package2, func1, func2, files, size) VALUES (:package1, :package2, :func1, :func2, :files, :size);", + conn.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (:pid1, :pid2, :func1, :func2, :files, :size);", **params) def compute_pkgdict(rows): pkgdict = dict() - for package, _, filename, size, function in rows: - funcdict = pkgdict.setdefault(package, {}) + for pid, _, filename, size, function in rows: + funcdict = pkgdict.setdefault(pid, {}) funcdict.setdefault(function, []).append((size, filename)) return pkgdict def process_pkgdict(conn, pkgdict): - for package1, funcdict1 in pkgdict.items(): + for pid1, funcdict1 in pkgdict.items(): for function1, files in funcdict1.items(): numfiles = len(files) size = sum(entry[0] for entry in files) - for package2, funcdict2 in pkgdict.items(): - if package1 == package2: + for pid2, funcdict2 in pkgdict.items(): + if pid1 == pid2: pkgnumfiles = numfiles - 1 pkgsize = size - min(entry[0] for entry in files) if pkgnumfiles == 0: @@ -37,7 +36,7 @@ def process_pkgdict(conn, pkgdict): pkgnumfiles = numfiles pkgsize = size for function2 in funcdict2.keys(): - insert_key = (package1, package2, function1, function2) + insert_key = (pid1, pid2, function1, function2) add_values(conn, insert_key, pkgnumfiles, pkgsize) def main(): @@ -48,7 +47,7 @@ def main(): conn.execute("DELETE FROM duplicate;") readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") for hashvalue, in fetchiter(readcur): - rows = conn.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;", + rows = conn.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;", hashvalue=hashvalue).fetchall() print("processing hash %s with %d entries" % (hashvalue, len(rows))) pkgdict = compute_pkgdict(rows) |