summary | refs | log | tree | commit | diff
path: root/update_sharing.py
diff options
context:
space:
mode:
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x update_sharing.py 22
1 files changed, 13 insertions, 9 deletions
diff --git a/update_sharing.py b/update_sharing.py
index 1e1e06a..6fd83f8 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -6,24 +6,24 @@ from dedup.utils import fetchiter, enable_sqlite_foreign_keys
def add_values(conn, insert_key, files, size):
params = dict(files=files, size=size, pid1=insert_key[0],
- pid2=insert_key[1], func1=insert_key[2], func2=insert_key[3])
- rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND func1 = :func1 AND func2 = :func2;"),
+ pid2=insert_key[1], fid1=insert_key[2], fid2=insert_key[3])
+ rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND fid1 = :fid1 AND fid2 = :fid2;"),
**params)
if rows.rowcount > 0:
return
- conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (:pid1, :pid2, :func1, :func2, :files, :size);"),
+ conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (:pid1, :pid2, :fid1, :fid2, :files, :size);"),
**params)
def compute_pkgdict(rows):
pkgdict = dict()
- for pid, _, filename, size, function in rows:
+ for pid, _, filename, size, fid in rows:
funcdict = pkgdict.setdefault(pid, {})
- funcdict.setdefault(function, []).append((size, filename))
+ funcdict.setdefault(fid, []).append((size, filename))
return pkgdict
def process_pkgdict(conn, pkgdict):
for pid1, funcdict1 in pkgdict.items():
- for function1, files in funcdict1.items():
+ for fid1, files in funcdict1.items():
numfiles = len(files)
size = sum(entry[0] for entry in files)
for pid2, funcdict2 in pkgdict.items():
@@ -35,8 +35,8 @@ def process_pkgdict(conn, pkgdict):
else:
pkgnumfiles = numfiles
pkgsize = size
- for function2 in funcdict2.keys():
- insert_key = (pid1, pid2, function1, function2)
+ for fid2 in funcdict2.keys():
+ insert_key = (pid1, pid2, fid1, fid2)
add_values(conn, insert_key, pkgnumfiles, pkgsize)
def main():
@@ -45,9 +45,10 @@ def main():
with db.begin() as conn:
conn.execute("DELETE FROM sharing;")
conn.execute("DELETE FROM duplicate;")
+ conn.execute("DELETE FROM issue;")
readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
- rows = conn.execute(sqlalchemy.text("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;"),
+ rows = conn.execute(sqlalchemy.text("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;"),
hashvalue=hashvalue).fetchall()
print("processing hash %s with %d entries" % (hashvalue, len(rows)))
pkgdict = compute_pkgdict(rows)
@@ -59,6 +60,9 @@ def main():
conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
cid=cid)
process_pkgdict(conn, pkgdict)
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
if __name__ == "__main__":
main()