diff options
Diffstat (limited to 'importpkg.py')
-rwxr-xr-x | importpkg.py | 70 |
1 file changed, 33 insertions(+), 37 deletions(-)
diff --git a/importpkg.py b/importpkg.py
index 54f6181..06d9da4 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
-"""This tool reads a debian package from stdin and emits a yaml stream on
+"""This tool reads a Debian package from stdin and emits a yaml stream on
 stdout. It does not access a database. Therefore it can be run in parallel and
-on multiple machines. The generated yaml conatins multiple documents. The first
+on multiple machines. The generated yaml contains multiple documents. The first
 document contains package metadata. Then a document is emitted for each file.
 And finally a document consisting of the string "commit" is emitted."""
 
@@ -16,26 +16,21 @@ import yaml
 
 from dedup.arreader import ArReader
 from dedup.debpkg import process_control, get_tar_hashes
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+    HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-boring_sha512_hashes = set((
-    # ""
-    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
-    # "\n"
-    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
 
 def sha512_nontrivial():
-    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+    return HashBlacklistContent(hashlib.sha512(), boring_content)
 
 def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    # don't blacklist boring hashes for gzip to get gzip issues right
-    return hashobj
+    return HashBlacklistContent(hashobj, boring_content)
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
@@ -49,6 +44,15 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
+def decompress_tar(filelike, extension):
+    if extension in (".lzma", ".xz"):
+        filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
+        extension = ""
+    if extension not in ("", ".gz", ".bz2"):
+        raise ValueError("unknown compression format with extension %r" %
+                extension)
+    return tarfile.open(fileobj=filelike, mode="r|" + extension[1:])
+
 def process_package(filelike, hash_functions):
     af = ArReader(filelike)
     af.read_magic()
@@ -58,11 +62,11 @@ def process_package(filelike, hash_functions):
             name = af.read_entry()
         except EOFError:
             raise ValueError("data.tar not found")
-        if name == "control.tar.gz":
+        if name.startswith("control.tar"):
             if state != "start":
-                raise ValueError("unexpected control.tar.gz")
+                raise ValueError("unexpected control.tar")
             state = "control"
-            tf = tarfile.open(fileobj=af, mode="r|gz")
+            tf = decompress_tar(af, name[11:])
             for elem in tf:
                 if elem.name != "./control":
                     continue
@@ -72,28 +76,20 @@ def process_package(filelike, hash_functions):
                 yield process_control(tf.extractfile(elem).read())
                 break
             continue
-        elif name == "data.tar.gz":
-            tf = tarfile.open(fileobj=af, mode="r|gz")
-        elif name == "data.tar.bz2":
-            tf = tarfile.open(fileobj=af, mode="r|bz2")
-        elif name == "data.tar.xz":
-            zf = DecompressedStream(af, lzma.LZMADecompressor())
-            tf = tarfile.open(fileobj=zf, mode="r|")
-        elif name == "data.tar":
-            tf = tarfile.open(fileobj=af, mode="r|")
-        else:
-            continue
-        if state != "control_file":
-            raise ValueError("missing control file")
-        for name, size, hashes in get_tar_hashes(tf, hash_functions):
-            try:
-                name = name.decode("utf8")
-            except UnicodeDecodeError:
-                print("warning: skipping filename with encoding error")
-                continue # skip files with non-utf8 encoding for now
-            yield dict(name=name, size=size, hashes=hashes)
-        yield "commit"
-        break
+        elif name.startswith("data.tar"):
+            if state != "control_file":
+                raise ValueError("missing control file")
+            state = "data"
+            tf = decompress_tar(af, name[8:])
+            for name, size, hashes in get_tar_hashes(tf, hash_functions):
+                try:
+                    name = name.decode("utf8")
+                except UnicodeDecodeError:
+                    print("warning: skipping filename with encoding error")
+                    continue # skip files with non-utf8 encoding for now
+                yield dict(name=name, size=size, hashes=hashes)
+            yield "commit"
+            break
 
 def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())