diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-07-17 16:27:08 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-07-17 16:27:08 +0200 |
commit | ed3e611cfc54b8c916e919701070bfd5c6770610 (patch) | |
tree | fe06694d5a2212c87a0d149eccb0f4cbb889a5cd /update_sharing.py | |
parent | a03daac99a237babcd874748d19fc0f809a1dc60 (diff) | |
parent | 6205c89b1e289f04dcea1e6e32fafa6357abf063 (diff) | |
download | debian-dedup-ed3e611cfc54b8c916e919701070bfd5c6770610.tar.gz |
Merge branch master into sqlalchemy
This basically pulls the packageid branch into sqlalchemy. The merge was
complex because many SQL statements had diverged. It brings us one step
closer to supporting PostgreSQL, because an "INSERT OR REPLACE" (which is
SQLite-specific syntax) was removed from readyaml.py in the packageid branch.
Conflicts:
update_sharing.py
webapp.py
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x | update_sharing.py | 23 |
1 files changed, 11 insertions, 12 deletions
diff --git a/update_sharing.py b/update_sharing.py index caca445..664b627 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -5,30 +5,29 @@ import sqlalchemy from dedup.utils import fetchiter, enable_sqlite_foreign_keys def add_values(conn, insert_key, files, size): - params = dict(files=files, size=size, package1=insert_key[0], - package2=insert_key[1], func1=insert_key[2], - func2=insert_key[3]) - rows = conn.execute("UPDATE sharing SET files = files + :files, size = size + :size WHERE package1 = :package1 AND package2 = :package2 AND func1 = :func1 AND func2 = :func2;", + params = dict(files=files, size=size, pid1=insert_key[0], + pid2=insert_key[1], func1=insert_key[2], func2=insert_key[3]) + rows = conn.execute("UPDATE sharing SET files = files + :files, size = size + :size WHERE pid1 = :pid1 AND pid2 = :pid2 AND func1 = :func1 AND func2 = :func2;", **params) if rows.rowcount > 0: return - conn.execute("INSERT INTO sharing (package1, package2, func1, func2, files, size) VALUES (:package1, :package2, :func1, :func2, :files, :size);", + conn.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (:pid1, :pid2, :func1, :func2, :files, :size);", **params) def compute_pkgdict(rows): pkgdict = dict() - for package, _, filename, size, function in rows: - funcdict = pkgdict.setdefault(package, {}) + for pid, _, filename, size, function in rows: + funcdict = pkgdict.setdefault(pid, {}) funcdict.setdefault(function, []).append((size, filename)) return pkgdict def process_pkgdict(conn, pkgdict): - for package1, funcdict1 in pkgdict.items(): + for pid1, funcdict1 in pkgdict.items(): for function1, files in funcdict1.items(): numfiles = len(files) size = sum(entry[0] for entry in files) - for package2, funcdict2 in pkgdict.items(): - if package1 == package2: + for pid2, funcdict2 in pkgdict.items(): + if pid1 == pid2: pkgnumfiles = numfiles - 1 pkgsize = size - min(entry[0] for entry in files) if pkgnumfiles == 0: @@ -37,7 +36,7 @@ def 
process_pkgdict(conn, pkgdict): pkgnumfiles = numfiles pkgsize = size for function2 in funcdict2.keys(): - insert_key = (package1, package2, function1, function2) + insert_key = (pid1, pid2, function1, function2) add_values(conn, insert_key, pkgnumfiles, pkgsize) def main(): @@ -48,7 +47,7 @@ def main(): conn.execute("DELETE FROM duplicate;") readcur = conn.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") for hashvalue, in fetchiter(readcur): - rows = conn.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;", + rows = conn.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = :hashvalue;", hashvalue=hashvalue).fetchall() print("processing hash %s with %d entries" % (hashvalue, len(rows))) pkgdict = compute_pkgdict(rows) |