#!/usr/bin/python
"""Rebuild the ``sharing`` and ``duplicate`` tables of ``test.sqlite3``.

For every hash value that occurs more than once in the ``hash`` table, the
matching ``content`` rows are recorded in ``duplicate`` and the per-package,
per-function counters in ``sharing`` are incremented.
"""

import sqlalchemy

from dedup.utils import fetchiter


def add_values(conn, insert_key, files, size):
    """Add ``files`` and ``size`` to one row of the ``sharing`` table.

    The row is identified by ``insert_key``, a tuple of
    ``(package1, package2, func1, func2)``.  An UPDATE is attempted first;
    if it touched no row, a new row is inserted with the given values.

    :param conn: connection object providing ``execute``
    :param insert_key: 4-tuple ``(package1, package2, func1, func2)``
    :param files: number of files to add
    :param size: size to add
    """
    rows = conn.execute(
        "UPDATE sharing SET files = files + ?, size = size + ? "
        "WHERE package1 = ? AND package2 = ? AND func1 = ? AND func2 = ?;",
        (files, size) + insert_key)
    if rows.rowcount > 0:
        # An existing row was incremented; nothing left to do.
        return
    conn.execute(
        "INSERT INTO sharing (package1, package2, func1, func2, files, size) "
        "VALUES (?, ?, ?, ?, ?, ?);",
        insert_key + (files, size))


def compute_pkgdict(rows):
    """Group result rows by package and hash function.

    :param rows: iterable of 5-tuples
        ``(package, <ignored>, filename, size, function)``
    :returns: ``{package: {function: [(size, filename), ...]}}``
    """
    pkgdict = {}
    for package, _, filename, size, function in rows:
        funcdict = pkgdict.setdefault(package, {})
        funcdict.setdefault(function, []).append((size, filename))
    return pkgdict


def process_pkgdict(conn, pkgdict):
    """Update the ``sharing`` table from a mapping built by compute_pkgdict.

    For every (package1, function1) entry and every (package2, function2)
    entry of ``pkgdict``, the file count and summed size of the
    (package1, function1) files are added via :func:`add_values`.

    When both packages are the same, the count is reduced by one and the
    smallest file size is subtracted; if that leaves no file, the package is
    skipped entirely.

    :param conn: connection object providing ``execute``
    :param pkgdict: ``{package: {function: [(size, filename), ...]}}``
    """
    for package1, funcdict1 in pkgdict.items():
        for function1, files in funcdict1.items():
            numfiles = len(files)
            size = sum(entry[0] for entry in files)
            for package2, funcdict2 in pkgdict.items():
                if package1 == package2:
                    # Within one package, leave out one file (the smallest)
                    # from the totals.
                    pkgnumfiles = numfiles - 1
                    pkgsize = size - min(entry[0] for entry in files)
                    if pkgnumfiles == 0:
                        continue
                else:
                    pkgnumfiles = numfiles
                    pkgsize = size
                for function2 in funcdict2:
                    insert_key = (package1, package2, function1, function2)
                    add_values(conn, insert_key, pkgnumfiles, pkgsize)


def main():
    """Recompute ``sharing`` and ``duplicate`` inside one ``db.begin()`` block.

    Both tables are emptied first.  Then, for each hash value that occurs
    more than once, the joined ``hash``/``content`` rows are fetched, a
    progress line is printed, the content ids are inserted into
    ``duplicate`` and the ``sharing`` counters are updated.
    """
    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
    with db.begin() as conn:
        conn.execute("PRAGMA foreign_keys = ON;")
        conn.execute("DELETE FROM sharing;")
        conn.execute("DELETE FROM duplicate;")
        readcur = conn.execute(
            "SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
        # NOTE(review): fetchiter is assumed to yield the result rows of
        # readcur one at a time -- confirm against dedup.utils.
        for hashvalue, in fetchiter(readcur):
            rows = conn.execute(
                "SELECT content.package, content.id, content.filename, "
                "content.size, hash.function FROM hash "
                "JOIN content ON hash.cid = content.id WHERE hash = ?;",
                (hashvalue,)).fetchall()
            print("processing hash %s with %d entries" %
                  (hashvalue, len(rows)))
            pkgdict = compute_pkgdict(rows)
            # One parameter tuple per row: the statement is executed once
            # for every content id (row[1]).
            conn.execute("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
                         *[(row[1],) for row in rows])
            process_pkgdict(conn, pkgdict)


if __name__ == "__main__":
    main()