1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
|
#!/usr/bin/python
import optparse
import sqlalchemy
from dedup.utils import fetchiter, configure_database_engine
def add_values(conn, insert_key, files, size):
params = dict(files=files, size=size, pid1=insert_key[0],
pid2=insert_key[1], fid1=insert_key[2], fid2=insert_key[3])
rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND fid1 = :fid1 AND fid2 = :fid2;"),
**params)
if rows.rowcount > 0:
return
conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (:pid1, :pid2, :fid1, :fid2, :files, :size);"),
**params)
def compute_pkgdict(rows):
pkgdict = dict()
for pid, _, filename, size, fid in rows:
funcdict = pkgdict.setdefault(pid, {})
funcdict.setdefault(fid, []).append((size, filename))
return pkgdict
def process_pkgdict(conn, pkgdict):
for pid1, funcdict1 in pkgdict.items():
for fid1, files in funcdict1.items():
numfiles = len(files)
size = sum(entry[0] for entry in files)
for pid2, funcdict2 in pkgdict.items():
if pid1 == pid2:
pkgnumfiles = numfiles - 1
pkgsize = size - min(entry[0] for entry in files)
if pkgnumfiles == 0:
continue
else:
pkgnumfiles = numfiles
pkgsize = size
for fid2 in funcdict2.keys():
insert_key = (pid1, pid2, fid1, fid2)
add_values(conn, insert_key, pkgnumfiles, pkgsize)
def main(db):
with db.begin() as conn:
conn.execute("DELETE FROM sharing;")
conn.execute("DELETE FROM duplicate;")
conn.execute("DELETE FROM issue;")
readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
rows = conn.execute(sqlalchemy.text("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = :hashvalue;"),
hashvalue=hashvalue).fetchall()
rowdict = dict()
for row in rows:
rowdict.setdefault(row[0], []).append(row[1:])
for eqclass, rows in rowdict.items():
if len(rows) < 2:
print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
continue
print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
pkgdict = compute_pkgdict(rows)
for row in rows:
cid = row[1]
already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
cid=cid)
if not already:
conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
cid=cid)
process_pkgdict(conn, pkgdict)
conn.execute(sqlalchemy.text("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');"))
conn.execute(sqlalchemy.text("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';"))
conn.execute(sqlalchemy.text("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';"))
if __name__ == "__main__":
parser = optparse.OptionParser()
parser.add_option("-d", "--database", action="store",
default="sqlite:///test.sqlite3",
help="location of the database")
options, args = parser.parse_args()
db = sqlalchemy.create_engine(options.database)
configure_database_engine(db)
main(db)
|