1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
|
#!/usr/bin/python
import sqlalchemy
from dedup.utils import fetchiter, enable_sqlite_foreign_keys
def add_values(conn, insert_key, files, size):
params = dict(files=files, size=size, package1=insert_key[0],
package2=insert_key[1], func1=insert_key[2],
func2=insert_key[3])
rows = conn.execute("UPDATE sharing SET files = files + :files, size = size + :size WHERE package1 = :package1 AND package2 = :package2 AND func1 = :func1 AND func2 = :func2;",
**params)
if rows.rowcount > 0:
return
conn.execute("INSERT INTO sharing (package1, package2, func1, func2, files, size) VALUES (:package1, :package2, :func1, :func2, :files, :size);",
**params)
def compute_pkgdict(rows):
pkgdict = dict()
for package, _, filename, size, function in rows:
funcdict = pkgdict.setdefault(package, {})
funcdict.setdefault(function, []).append((size, filename))
return pkgdict
def process_pkgdict(conn, pkgdict):
for package1, funcdict1 in pkgdict.items():
for function1, files in funcdict1.items():
numfiles = len(files)
size = sum(entry[0] for entry in files)
for package2, funcdict2 in pkgdict.items():
if package1 == package2:
pkgnumfiles = numfiles - 1
pkgsize = size - min(entry[0] for entry in files)
if pkgnumfiles == 0:
continue
else:
pkgnumfiles = numfiles
pkgsize = size
for function2 in funcdict2.keys():
insert_key = (package1, package2, function1, function2)
add_values(conn, insert_key, pkgnumfiles, pkgsize)
def main():
db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
enable_sqlite_foreign_keys(db)
with db.begin() as conn:
conn.execute("DELETE FROM sharing;")
conn.execute("DELETE FROM duplicate;")
readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
rows = conn.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;",
hashvalue=hashvalue).fetchall()
print("processing hash %s with %d entries" % (hashvalue, len(rows)))
pkgdict = compute_pkgdict(rows)
for row in rows:
cid = row[1]
already = conn.scalar("SELECT cid FROM duplicate WHERE cid = :cid;",
cid=cid)
if not already:
conn.execute("INSERT INTO duplicate (cid) VALUES (:cid);",
cid=cid)
process_pkgdict(conn, pkgdict)
if __name__ == "__main__":
main()
|