summary | refs | log | tree | commit | diff
path: root/importpkg.py
diff options
context:
space:
mode:
Diffstat (limited to 'importpkg.py')
-rwxr-xr-x importpkg.py 16
1 files changed, 14 insertions, 2 deletions
diff --git a/importpkg.py b/importpkg.py
index 89020b9..d626fba 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -20,6 +20,7 @@ import lzma
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
+from dedup.image import ImageHash
class ArReader(object):
global_magic = b"!<arch>\n"
@@ -96,11 +97,17 @@ def gziphash():
hashobj.name = "gzip_sha512"
return HashBlacklist(hashobj, boring_sha512_hashes)
+def imagehash():
+ hashobj = ImageHash(hashlib.sha512())
+ hashobj = SuppressingHash(hashobj, (ValueError,))
+ hashobj.name = "image_sha512"
+ return hashobj
+
def get_hashes(tar):
for elem in tar:
if not elem.isreg(): # excludes hard links as well
continue
- hasher = MultiHash(sha512_nontrivial(), gziphash())
+ hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
hasher = hash_file(hasher, tar.extractfile(elem))
for hashobj in hasher.hashes:
hashvalue = hashobj.hexdigest()
@@ -167,8 +174,13 @@ def process_package(db, filelike):
if state != "control_file":
raise ValueError("missing control file")
for name, size, function, hexhash in get_hashes(tf):
+ try:
+ name = name.decode("utf8")
+ except UnicodeDecodeError:
+ print("warning: skipping filename with encoding error")
+ continue # skip files with non-utf8 encoding for now
cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
- (package, name.decode("utf8"), size, function, hexhash))
+ (package, name, size, function, hexhash))
db.commit()
return
raise ValueError("data.tar not found")