author     Helmut Grohne <helmut@subdivi.de>  2013-07-23 18:53:55 +0200
committer  Helmut Grohne <helmut@subdivi.de>  2013-07-23 18:53:55 +0200
commit     6f88561d726327c90f83b8aad1db26abbd4cdf1e (patch)
tree       49d04fed10475183190cbe9ce536947958e9a749 /update_sharing.py
parent     6206dea43941560a29c9a1105ae3055740ab80aa (diff)
download   debian-dedup-6f88561d726327c90f83b8aad1db26abbd4cdf1e.tar.gz
schema: reference hash functions by integer key
Referencing by integer key already worked quite well for package.id. On a test data set at 5% of the full size, this transformation reduces the database size by about 4%.
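
A minimal sketch of the idea behind this commit, assuming a lookup table that maps each hash function name to a small integer key. Neither the table (here called "function") nor its columns appear in this diff, so the names and the example function names are assumptions for illustration only:

    import sqlite3

    db = sqlite3.connect(":memory:")
    cur = db.cursor()
    # Assumed lookup table: one row per hash function, keyed by integer.
    cur.execute("CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL);")
    cur.executemany("INSERT INTO function (name) VALUES (?);",
                    [("sha512",), ("gzip_sha512",)])  # example names only
    # Rows in hash now carry a small integer fid instead of repeating the
    # function name as a string on every row; dropping that repeated string
    # is where the size saving comes from.
    cur.execute("CREATE TABLE hash (cid INTEGER, fid INTEGER REFERENCES function(id), hash TEXT);")
    cur.execute("INSERT INTO hash SELECT 1, id, 'deadbeef' FROM function WHERE name = 'sha512';")
    db.commit()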
Diffstat (limited to 'update_sharing.py')
-rwxr-xr-x  update_sharing.py  16
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/update_sharing.py b/update_sharing.py
index 55e8096..e1a2d68 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -5,23 +5,23 @@ import sqlite3
from dedup.utils import fetchiter
def add_values(cursor, insert_key, files, size):
- cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND func1 = ? AND func2 = ?;",
+ cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND fid1 = ? AND fid2 = ?;",
(files, size) + insert_key)
if cursor.rowcount > 0:
return
- cursor.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
+ cursor.execute("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (?, ?, ?, ?, ?, ?);",
insert_key + (files, size))
def compute_pkgdict(rows):
pkgdict = dict()
- for pid, _, filename, size, function in rows:
+ for pid, _, filename, size, fid in rows:
funcdict = pkgdict.setdefault(pid, {})
- funcdict.setdefault(function, []).append((size, filename))
+ funcdict.setdefault(fid, []).append((size, filename))
return pkgdict
def process_pkgdict(cursor, pkgdict):
for pid1, funcdict1 in pkgdict.items():
- for function1, files in funcdict1.items():
+ for fid1, files in funcdict1.items():
numfiles = len(files)
size = sum(entry[0] for entry in files)
for pid2, funcdict2 in pkgdict.items():
@@ -33,8 +33,8 @@ def process_pkgdict(cursor, pkgdict):
else:
pkgnumfiles = numfiles
pkgsize = size
- for function2 in funcdict2.keys():
- insert_key = (pid1, pid2, function1, function2)
+ for fid2 in funcdict2.keys():
+ insert_key = (pid1, pid2, fid1, fid2)
add_values(cursor, insert_key, pkgnumfiles, pkgsize)
def main():
@@ -46,7 +46,7 @@ def main():
readcur = db.cursor()
readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
for hashvalue, in fetchiter(readcur):
- cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+ cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
(hashvalue,))
rows = cur.fetchall()
print("processing hash %s with %d entries" % (hashvalue, len(rows)))