#!/usr/bin/python3 import argparse import sqlite3 import typing from dedup.utils import fetchiter def add_values( cursor: sqlite3.Cursor, insert_key: typing.Tuple[int, int, int, int], files: int, size: int, ) -> None: cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND fid1 = ? AND fid2 = ?;", (files, size) + insert_key) if cursor.rowcount > 0: return cursor.execute("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (?, ?, ?, ?, ?, ?);", insert_key + (files, size)) def compute_pkgdict( rows: typing.Iterable[typing.Tuple[int, typing.Any, str, int, int]] ) -> typing.Dict[int, typing.Dict[int, typing.List[typing.Tuple[int, str]]]]: pkgdict: typing.Dict[ int, typing.Dict[int, typing.List[typing.Tuple[int, str]]] ] = {} for pid, _, filename, size, fid in rows: funcdict = pkgdict.setdefault(pid, {}) funcdict.setdefault(fid, []).append((size, filename)) return pkgdict def process_pkgdict( cursor: sqlite3.Cursor, pkgdict: typing.Dict[ int, typing.Dict[int, typing.List[typing.Tuple[int, str]]] ], ) -> None: for pid1, funcdict1 in pkgdict.items(): for fid1, files in funcdict1.items(): numfiles = len(files) size = sum(entry[0] for entry in files) for pid2, funcdict2 in pkgdict.items(): if pid1 == pid2: pkgnumfiles = numfiles - 1 pkgsize = size - min(entry[0] for entry in files) if pkgnumfiles == 0: continue else: pkgnumfiles = numfiles pkgsize = size for fid2 in funcdict2.keys(): insert_key = (pid1, pid2, fid1, fid2) add_values(cursor, insert_key, pkgnumfiles, pkgsize) def main(db: sqlite3.Connection) -> None: cur = db.cursor() cur.execute("PRAGMA foreign_keys = ON;") cur.execute("DELETE FROM sharing;") cur.execute("DELETE FROM duplicate;") cur.execute("DELETE FROM issue;") readcur = db.cursor() readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") for hashvalue, in fetchiter(readcur): cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;", (hashvalue,)) rowdict: typing.Dict[ int, typing.List[typing.Tuple[int, int, str, int, int]] ] = {} for row in cur.fetchall(): rowdict.setdefault(row[0], []).append(row[1:]) for eqclass, rows in rowdict.items(): if len(rows) < 2: print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass)) continue print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows))) pkgdict = compute_pkgdict(rows) cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);", [(row[1],) for row in rows]) process_pkgdict(cur, pkgdict) cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');") cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';") cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';") db.commit() if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("-d", "--database", action="store", default="test.sqlite3", help="path to the sqlite3 database file") args = parser.parse_args() main(sqlite3.connect(args.database))