summary | refs | log | tree | commit | diff
path: root/importpkg.py
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de> 2013-03-09 18:43:47 +0100
committer: Helmut Grohne <helmut@subdivi.de> 2013-03-09 18:43:47 +0100
commit: 5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd (patch)
tree: 08f4b4cd1aae470eb6bae21bd0a4859ecd91aee9 /importpkg.py
parent: 423ceee0d0befc8755a9ae915d15e8d415d98159 (diff)
download: debian-dedup-5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd.tar.gz
split the hash columns out of the content table into a separate hash table
In the old content table, the (package, filename, size) values were repeated in one row per hash function. Now the schema represents that each file has precisely one size, but multiple hashes.
Diffstat (limited to 'importpkg.py')
-rwxr-xr-x importpkg.py 13
1 files changed, 9 insertions, 4 deletions
diff --git a/importpkg.py b/importpkg.py
index aae9a7f..5d6a58c 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -102,10 +102,12 @@ def get_hashes(tar):
continue
hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
hasher = hash_file(hasher, tar.extractfile(elem))
+ hashes = {}
for hashobj in hasher.hashes:
hashvalue = hashobj.hexdigest()
if hashvalue:
- yield (elem.name, elem.size, hashobj.name, hashvalue)
+ hashes[hashobj.name] = hashvalue
+ yield (elem.name, elem.size, hashes)
def process_package(db, filelike):
cur = db.cursor()
@@ -169,14 +171,17 @@ def process_package(db, filelike):
continue
if state != "control_file":
raise ValueError("missing control file")
- for name, size, function, hexhash in get_hashes(tf):
+ for name, size, hashes in get_hashes(tf):
try:
name = name.decode("utf8")
except UnicodeDecodeError:
print("warning: skipping filename with encoding error")
continue # skip files with non-utf8 encoding for now
- cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
- (package, name, size, function, hexhash))
+ cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);",
+ (package, name, size))
+ cid = cur.lastrowid
+ cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
+ ((cid, func, hexhash) for func, hexhash in hashes.items()))
db.commit()
return
raise ValueError("data.tar not found")