summaryrefslogtreecommitdiff
path: root/update_sharing.py
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2013-04-24 20:56:46 +0200
committerHelmut Grohne <helmut@subdivi.de>2013-04-24 21:00:20 +0200
commit94eb867119af05639691ec7990dcf2d6a956dd86 (patch)
tree6f33e5f2badf1b19182c718f46614869047516cb /update_sharing.py
parentd2b83735a4810cec7bf7c0dd6fb521498f104435 (diff)
downloaddebian-dedup-94eb867119af05639691ec7990dcf2d6a956dd86.tar.gz
implement the /compare/pkg1/pkg2 page differently
The original version had two major drawbacks: 1) The SQL query used would cause a btree sort, so the time waiting for the first output was rather long. 2) For packages with many equal files, the output would grow with O(n^2). Thanks to the suggestions by Christine Grohne and Klaus Aehlig. The approach now groups files in package1 by their main hash value (sha512). It also now does manually some work that SQL was designed to solve. To speed up page generation, a new caching table was added that identifies which files have corresponding shared files.
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-xupdate_sharing.py7
1 file changed, 5 insertions, 2 deletions
diff --git a/update_sharing.py b/update_sharing.py
index b45e40b..d2b357b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -14,7 +14,7 @@ def add_values(cursor, insert_key, files, size):
def compute_pkgdict(rows):
pkgdict = dict()
- for package, filename, size, function in rows:
+ for package, _, filename, size, function in rows:
funcdict = pkgdict.setdefault(package, {})
funcdict.setdefault(function, []).append((size, filename))
return pkgdict
@@ -42,14 +42,17 @@ def main():
cur = db.cursor()
cur.execute("PRAGMA foreign_keys = ON;")
cur.execute("DELETE FROM sharing;")
+ cur.execute("DELETE FROM duplicate;")
readcur = db.cursor()
readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
- cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+ cur.execute("SELECT content.package, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
(hashvalue,))
rows = cur.fetchall()
print("processing hash %s with %d entries" % (hashvalue, len(rows)))
pkgdict = compute_pkgdict(rows)
+ cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+ [(row[1],) for row in rows])
process_pkgdict(cur, pkgdict)
db.commit()