summaryrefslogtreecommitdiff
path: root/update_sharing.py
diff options
context:
space:
mode:
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-xupdate_sharing.py85
1 files changed, 46 insertions, 39 deletions
diff --git a/update_sharing.py b/update_sharing.py
index ca6890b..450bfc7 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -1,17 +1,20 @@
#!/usr/bin/python
import optparse
-import sqlite3
-from dedup.utils import fetchiter
+import sqlalchemy
-def add_values(cursor, insert_key, files, size):
- cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND fid1 = ? AND fid2 = ?;",
- (files, size) + insert_key)
- if cursor.rowcount > 0:
+from dedup.utils import fetchiter, enable_sqlite_foreign_keys
+
+def add_values(conn, insert_key, files, size):
+ params = dict(files=files, size=size, pid1=insert_key[0],
+ pid2=insert_key[1], fid1=insert_key[2], fid2=insert_key[3])
+ rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND fid1 = :fid1 AND fid2 = :fid2;"),
+ **params)
+ if rows.rowcount > 0:
return
- cursor.execute("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
- insert_key + (files, size))
+ conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (:pid1, :pid2, :fid1, :fid2, :files, :size);"),
+ **params)
def compute_pkgdict(rows):
pkgdict = dict()
@@ -20,7 +23,7 @@ def compute_pkgdict(rows):
funcdict.setdefault(fid, []).append((size, filename))
return pkgdict
-def process_pkgdict(cursor, pkgdict):
+def process_pkgdict(conn, pkgdict):
for pid1, funcdict1 in pkgdict.items():
for fid1, files in funcdict1.items():
numfiles = len(files)
@@ -36,40 +39,44 @@ def process_pkgdict(cursor, pkgdict):
pkgsize = size
for fid2 in funcdict2.keys():
insert_key = (pid1, pid2, fid1, fid2)
- add_values(cursor, insert_key, pkgnumfiles, pkgsize)
+ add_values(conn, insert_key, pkgnumfiles, pkgsize)
def main(db):
- cur = db.cursor()
- cur.execute("PRAGMA foreign_keys = ON;")
- cur.execute("DELETE FROM sharing;")
- cur.execute("DELETE FROM duplicate;")
- cur.execute("DELETE FROM issue;")
- readcur = db.cursor()
- readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
- for hashvalue, in fetchiter(readcur):
- cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;",
- (hashvalue,))
- rowdict = dict()
- for row in cur.fetchall():
- rowdict.setdefault(row[0], []).append(row[1:])
- for eqclass, rows in rowdict.items():
- if len(rows) < 2:
- print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
- continue
- print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
- pkgdict = compute_pkgdict(rows)
- cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
- [(row[1],) for row in rows])
- process_pkgdict(cur, pkgdict)
- cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
- cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
- cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
- db.commit()
+ with db.begin() as conn:
+ conn.execute("DELETE FROM sharing;")
+ conn.execute("DELETE FROM duplicate;")
+ conn.execute("DELETE FROM issue;")
+ readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
+ for hashvalue, in fetchiter(readcur):
+ rows = conn.execute(sqlalchemy.text("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = :hashvalue;"),
+ hashvalue=hashvalue).fetchall()
+ rowdict = dict()
+ for row in rows:
+ rowdict.setdefault(row[0], []).append(row[1:])
+ for eqclass, rows in rowdict.items():
+ if len(rows) < 2:
+ print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+ continue
+ print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+ pkgdict = compute_pkgdict(rows)
+ for row in rows:
+ cid = row[1]
+ already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
+ cid=cid)
+ if not already:
+ conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
+ cid=cid)
+ process_pkgdict(conn, pkgdict)
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
if __name__ == "__main__":
parser = optparse.OptionParser()
parser.add_option("-d", "--database", action="store",
- default="test.sqlite3",
- help="path to the sqlite3 database file")
+ default="sqlite:///test.sqlite3",
+ help="location of the database")
options, args = parser.parse_args()
- main(sqlite3.connect(options.database))
+ db = sqlalchemy.create_engine(options.database)
+ enable_sqlite_foreign_keys(db)
+ main(db)