summaryrefslogtreecommitdiff
path: root/update_sharing.py
blob: 1e1e06a39a8f5cc9d03927274c9f0859a113213e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/python

import sqlalchemy

from dedup.utils import fetchiter, enable_sqlite_foreign_keys

def add_values(conn, insert_key, files, size):
    params = dict(files=files, size=size, pid1=insert_key[0],
                  pid2=insert_key[1], func1=insert_key[2], func2=insert_key[3])
    rows = conn.execute(sqlalchemy.text("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND func1 = :func1 AND func2 = :func2;"),
                        **params)
    if rows.rowcount > 0:
        return
    conn.execute(sqlalchemy.text("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (:pid1, :pid2, :func1, :func2, :files, :size);"),
                 **params)

def compute_pkgdict(rows):
    pkgdict = dict()
    for pid, _, filename, size, function in rows:
        funcdict = pkgdict.setdefault(pid, {})
        funcdict.setdefault(function, []).append((size, filename))
    return pkgdict

def process_pkgdict(conn, pkgdict):
    for pid1, funcdict1 in pkgdict.items():
        for function1, files in funcdict1.items():
            numfiles = len(files)
            size = sum(entry[0] for entry in files)
            for pid2, funcdict2 in pkgdict.items():
                if pid1 == pid2:
                    pkgnumfiles = numfiles - 1
                    pkgsize = size - min(entry[0] for entry in files)
                    if pkgnumfiles == 0:
                        continue
                else:
                    pkgnumfiles = numfiles
                    pkgsize = size
                for function2 in funcdict2.keys():
                    insert_key = (pid1, pid2, function1, function2)
                    add_values(conn, insert_key, pkgnumfiles, pkgsize)

def main():
    db = sqlalchemy.create_engine("sqlite:///test.sqlite3")
    enable_sqlite_foreign_keys(db)
    with db.begin() as conn:
        conn.execute("DELETE FROM sharing;")
        conn.execute("DELETE FROM duplicate;")
        readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
        for hashvalue, in fetchiter(readcur):
            rows = conn.execute(sqlalchemy.text("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;"),
                                hashvalue=hashvalue).fetchall()
            print("processing hash %s with %d entries" % (hashvalue, len(rows)))
            pkgdict = compute_pkgdict(rows)
            for row in rows:
                cid = row[1]
                already = conn.scalar(sqlalchemy.text("SELECT cid FROM duplicate WHERE cid = :cid;"),
                                      cid=cid)
                if not already:
                    conn.execute(sqlalchemy.text("INSERT INTO duplicate (cid) VALUES (:cid);"),
                                 cid=cid)
            process_pkgdict(conn, pkgdict)

if __name__ == "__main__":
    main()