diff options
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 85 |
1 files changed, 46 insertions, 39 deletions
diff --git a/update_sharing.py b/update_sharing.py index ca6890b..450bfc7 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -1,17 +1,20 @@ #!/usr/bin/python import optparse -import sqlite3 -from dedup.utils import fetchiter +import sqlalchemy -def add_values(cursor, insert_key, files, size): - cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND fid1 = ? AND fid2 = ?;", - (files, size) + insert_key) - if cursor.rowcount > 0: +from dedup.utils import fetchiter, enable_sqlite_foreign_keys + +def add_values(conn, insert_key, files, size): + params = dict(files=files, size=size, pid1=insert_key[0], + pid2=insert_key[1], fid1=insert_key[2], fid2=insert_key[3]) + rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND fid1 = :fid1 AND fid2 = :fid2;"), + **params) + if rows.rowcount > 0: return - cursor.execute("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (?, ?, ?, ?, ?, ?);", - insert_key + (files, size)) + conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (:pid1, :pid2, :fid1, :fid2, :files, :size);"), + **params) def compute_pkgdict(rows): pkgdict = dict() @@ -20,7 +23,7 @@ def compute_pkgdict(rows): funcdict.setdefault(fid, []).append((size, filename)) return pkgdict -def process_pkgdict(cursor, pkgdict): +def process_pkgdict(conn, pkgdict): for pid1, funcdict1 in pkgdict.items(): for fid1, files in funcdict1.items(): numfiles = len(files) @@ -36,40 +39,44 @@ def process_pkgdict(cursor, pkgdict): pkgsize = size for fid2 in funcdict2.keys(): insert_key = (pid1, pid2, fid1, fid2) - add_values(cursor, insert_key, pkgnumfiles, pkgsize) + add_values(conn, insert_key, pkgnumfiles, pkgsize) def main(db): - cur = db.cursor() - cur.execute("PRAGMA foreign_keys = ON;") - cur.execute("DELETE FROM sharing;") - cur.execute("DELETE FROM duplicate;") - cur.execute("DELETE FROM issue;") - readcur = db.cursor() - readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") - for hashvalue, in fetchiter(readcur): - cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;", - (hashvalue,)) - rowdict = dict() - for row in cur.fetchall(): - rowdict.setdefault(row[0], []).append(row[1:]) - for eqclass, rows in rowdict.items(): - if len(rows) < 2: - print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass)) - continue - print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows))) - pkgdict = compute_pkgdict(rows) - cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);", - [(row[1],) for row in rows]) - process_pkgdict(cur, pkgdict) - cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');") - cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';") - cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';") - db.commit() + with db.begin() as conn: + conn.execute("DELETE FROM sharing;") + conn.execute("DELETE FROM duplicate;") + conn.execute("DELETE FROM issue;") + readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") + for hashvalue, in fetchiter(readcur): + rows = conn.execute(sqlalchemy.text("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = :hashvalue;"), + hashvalue=hashvalue).fetchall() + rowdict = dict() + for row in rows: + rowdict.setdefault(row[0], []).append(row[1:]) + for eqclass, rows in rowdict.items(): + if len(rows) < 2: + print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass)) + continue + print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows))) + pkgdict = compute_pkgdict(rows) + for row in rows: + cid = row[1] + already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"), + cid=cid) + if not already: + conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"), + cid=cid) + process_pkgdict(conn, pkgdict) + conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');") + conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';") + conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';") if __name__ == "__main__": parser = optparse.OptionParser() parser.add_option("-d", "--database", action="store", - default="test.sqlite3", - help="path to the sqlite3 database file") + default="sqlite:///test.sqlite3", + help="location of the database") options, args = parser.parse_args() - main(sqlite3.connect(options.database)) + db = sqlalchemy.create_engine(options.database) + enable_sqlite_foreign_keys(db) + main(db) |