#!/usr/bin/python import optparse import sqlalchemy from dedup.utils import fetchiter, configure_database_engine def add_values(conn, insert_key, files, size): params = dict(files=files, size=size, pid1=insert_key[0], pid2=insert_key[1], fid1=insert_key[2], fid2=insert_key[3]) rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND fid1 = :fid1 AND fid2 = :fid2;"), **params) if rows.rowcount > 0: return conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (:pid1, :pid2, :fid1, :fid2, :files, :size);"), **params) def compute_pkgdict(rows): pkgdict = dict() for pid, _, filename, size, fid in rows: funcdict = pkgdict.setdefault(pid, {}) funcdict.setdefault(fid, []).append((size, filename)) return pkgdict def process_pkgdict(conn, pkgdict): for pid1, funcdict1 in pkgdict.items(): for fid1, files in funcdict1.items(): numfiles = len(files) size = sum(entry[0] for entry in files) for pid2, funcdict2 in pkgdict.items(): if pid1 == pid2: pkgnumfiles = numfiles - 1 pkgsize = size - min(entry[0] for entry in files) if pkgnumfiles == 0: continue else: pkgnumfiles = numfiles pkgsize = size for fid2 in funcdict2.keys(): insert_key = (pid1, pid2, fid1, fid2) add_values(conn, insert_key, pkgnumfiles, pkgsize) def main(db): with db.begin() as conn: conn.execute("DELETE FROM sharing;") conn.execute("DELETE FROM duplicate;") conn.execute("DELETE FROM issue;") readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") for hashvalue, in fetchiter(readcur): rows = conn.execute(sqlalchemy.text("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = :hashvalue;"), hashvalue=hashvalue).fetchall() rowdict = dict() for row in rows: rowdict.setdefault(row[0], []).append(row[1:]) for eqclass, rows in rowdict.items(): if len(rows) < 2: print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass)) continue print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows))) pkgdict = compute_pkgdict(rows) for row in rows: cid = row[1] already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"), cid=cid) if not already: conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"), cid=cid) process_pkgdict(conn, pkgdict) conn.execute(sqlalchemy.text("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")) conn.execute(sqlalchemy.text("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")) conn.execute(sqlalchemy.text("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")) if __name__ == "__main__": parser = optparse.OptionParser() parser.add_option("-d", "--database", action="store", default="sqlite:///test.sqlite3", help="location of the database") options, args = parser.parse_args() db = sqlalchemy.create_engine(options.database) configure_database_engine(db) main(db)