summaryrefslogtreecommitdiff
path: root/update_sharing.py
diff options
context:
space:
mode:
author Helmut Grohne <helmut@subdivi.de> 2013-07-23 23:23:41 +0200
committer Helmut Grohne <helmut@subdivi.de> 2013-07-23 23:23:41 +0200
commit eaba84e444c77495a5654b600c599646b8aa1aed (patch)
tree ff6bc8bb15de0c3669e2a6a6ad159b39dd638594 /update_sharing.py
parent 6206dea43941560a29c9a1105ae3055740ab80aa (diff)
download debian-dedup-eaba84e444c77495a5654b600c599646b8aa1aed.tar.gz
schema: identify hash values by an integer hashid
This one is a bit more complex than the other transformations, because the new hashvalue table has to be cleaned with a trigger. During a test import, the -wal file exploded. The resulting db is similar in size to the original.
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x update_sharing.py 10
1 files changed, 5 insertions, 5 deletions
diff --git a/update_sharing.py b/update_sharing.py
index 55e8096..bca0213 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -44,12 +44,12 @@ def main():
cur.execute("DELETE FROM sharing;")
cur.execute("DELETE FROM duplicate;")
readcur = db.cursor()
- readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
- for hashvalue, in fetchiter(readcur):
- cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
- (hashvalue,))
+ readcur.execute("SELECT hid FROM hash GROUP BY hid HAVING count(*) > 1;")
+ for hid, in fetchiter(readcur):
+ cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hid = ?;",
+ (hid,))
rows = cur.fetchall()
- print("processing hash %s with %d entries" % (hashvalue, len(rows)))
+ print("processing hash %d with %d entries" % (hid, len(rows)))
pkgdict = compute_pkgdict(rows)
cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
[(row[1],) for row in rows])