summary | refs | log | tree | commit | diff
path: root/update_sharing.py
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de> 2013-08-02 08:40:49 +0200
committer: Helmut Grohne <helmut@subdivi.de> 2013-08-02 08:40:49 +0200
commit: cb3708825bf7ea32314040575cef35980dad0cd8 (patch)
tree: 31575a8525dc90ba6904268d94f47e1604bf0557 /update_sharing.py
parent: a4bbbb6e664e605634cb3f9e0564c7e4a93697be (diff)
parent: 2712edb550968ce7ec8cd9800241d7944666631a (diff)
download: debian-dedup-cb3708825bf7ea32314040575cef35980dad0cd8.tar.gz
Merge branch master into sqlalchemy
This makes the sqlalchemy branch schema-compatible with master again. The biggest change on master was the introduction of the function table, which caused most of the conflicts. Note that webapp.py had one conflict that git did not detect: the selection of issues in show_package needed to be converted to sqlalchemy.

Conflicts: README, update_sharing.py, webapp.py
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x update_sharing.py 22
1 file changed, 13 insertions, 9 deletions
diff --git a/update_sharing.py b/update_sharing.py
index 1e1e06a..6fd83f8 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -6,24 +6,24 @@ from dedup.utils import fetchiter, enable_sqlite_foreign_keys
def add_values(conn, insert_key, files, size):
params = dict(files=files, size=size, pid1=insert_key[0],
- pid2=insert_key[1], func1=insert_key[2], func2=insert_key[3])
- rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND func1 = :func1 AND func2 = :func2;"),
+ pid2=insert_key[1], fid1=insert_key[2], fid2=insert_key[3])
+ rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND fid1 = :fid1 AND fid2 = :fid2;"),
**params)
if rows.rowcount > 0:
return
- conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (:pid1, :pid2, :func1, :func2, :files, :size);"),
+ conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (:pid1, :pid2, :fid1, :fid2, :files, :size);"),
**params)
def compute_pkgdict(rows):
pkgdict = dict()
- for pid, _, filename, size, function in rows:
+ for pid, _, filename, size, fid in rows:
funcdict = pkgdict.setdefault(pid, {})
- funcdict.setdefault(function, []).append((size, filename))
+ funcdict.setdefault(fid, []).append((size, filename))
return pkgdict
def process_pkgdict(conn, pkgdict):
for pid1, funcdict1 in pkgdict.items():
- for function1, files in funcdict1.items():
+ for fid1, files in funcdict1.items():
numfiles = len(files)
size = sum(entry[0] for entry in files)
for pid2, funcdict2 in pkgdict.items():
@@ -35,8 +35,8 @@ def process_pkgdict(conn, pkgdict):
else:
pkgnumfiles = numfiles
pkgsize = size
- for function2 in funcdict2.keys():
- insert_key = (pid1, pid2, function1, function2)
+ for fid2 in funcdict2.keys():
+ insert_key = (pid1, pid2, fid1, fid2)
add_values(conn, insert_key, pkgnumfiles, pkgsize)
def main():
@@ -45,9 +45,10 @@ def main():
with db.begin() as conn:
conn.execute("DELETE FROM sharing;")
conn.execute("DELETE FROM duplicate;")
+ conn.execute("DELETE FROM issue;")
readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
- rows = conn.execute(sqlalchemy.text("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;"),
+ rows = conn.execute(sqlalchemy.text("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;"),
hashvalue=hashvalue).fetchall()
print("processing hash %s with %d entries" % (hashvalue, len(rows)))
pkgdict = compute_pkgdict(rows)
@@ -59,6 +60,9 @@ def main():
conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
cid=cid)
process_pkgdict(conn, pkgdict)
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
+ conn.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
if __name__ == "__main__":
main()