diff options
author | Helmut Grohne <helmut@subdivi.de> | 2016-05-01 14:31:56 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2016-05-01 14:31:56 +0200 |
commit | e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e (patch) | |
tree | 9b54b6992e466792380f95122415c277f184cbc1 | |
parent | 2aef917cabba4e660f2525a551368532dbc7c195 (diff) | |
download | debian-dedup-e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e.tar.gz |
push more functionality into DebExtractor
The handle_ar_member and handle_ar_end methods now have default
implementations. The default handle_ar_member dispatches to the new handlers
handle_debversion, handle_control_tar and handle_data_tar.
In that process two additional bugs were fixed:
* decompress_tar was wrongly passing errors="surrogateescape" for
Python 2.x even though that's only supported for Python 3.x.
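* Callers of decompress actually pass the extension as unicode, so the keys of the decompressors table and the documented type of the extension parameter are now unicode as well.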
-rw-r--r-- | dedup/compression.py | 10 | ||||
-rw-r--r-- | dedup/debpkg.py | 79 | ||||
-rwxr-xr-x | importpkg.py | 84 |
3 files changed, 106 insertions, 67 deletions
diff --git a/dedup/compression.py b/dedup/compression.py index 5df6613..7f6dc99 100644 --- a/dedup/compression.py +++ b/dedup/compression.py @@ -156,10 +156,10 @@ class DecompressedStream(object): self.closed = True decompressors = { - '.gz': GzipDecompressor, - '.bz2': bz2.BZ2Decompressor, - '.lzma': lzma.LZMADecompressor, - '.xz': lzma.LZMADecompressor, + u'.gz': GzipDecompressor, + u'.bz2': bz2.BZ2Decompressor, + u'.lzma': lzma.LZMADecompressor, + u'.xz': lzma.LZMADecompressor, } def decompress(filelike, extension): @@ -168,7 +168,7 @@ def decompress(filelike, extension): close(). @param extension: permitted values are "", ".gz", ".bz2", ".lzma", and ".xz" - @type extension: str + @type extension: unicode @returns: a read-only byte-stream with the decompressed contents of the original filelike. It supports read(size) and close(). If the original supports seek(pos) and tell(), then it also supports diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 04773de..ba0b7c9 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -1,6 +1,10 @@ +import sys +import tarfile + from debian import deb822 from dedup.arreader import ArReader +from dedup.compression import decompress from dedup.hashing import hash_file def process_control(control_contents): @@ -55,9 +59,37 @@ def get_tar_hashes(tar, hash_functions): hashes[hashobj.name] = hashvalue yield (elem.name, elem.size, hashes) +if sys.version_info.major >= 3: + def opentar(filelike): + return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8", + errors="surrogateescape") + + def decodetarname(name): + """Decoded name of a tarinfo. + @raises UnicodeDecodeError: + """ + try: + name.encode("utf8", "strict") + except UnicodeEncodeError as e: + if e.reason == "surrogates not allowed": + name.encode("utf8", "surrogateescape").decode("utf8", "strict") + return name +else: + def opentar(filelike): + return tarfile.open(fileobj=filelike, mode="r|") + + def decodetarname(name): + """Decoded name of a tarinfo. 
+ @raises UnicodeDecodeError: + """ + return name.decode("utf8") + class DebExtractor(object): "Base class for extracting desired features from a Debian package." + def __init__(self): + self.arstate = "start" + def process(self, filelike): """Process a Debian package. @param filelike: is a file-like object containing the contents of the @@ -76,11 +108,58 @@ class DebExtractor(object): def handle_ar_member(self, name, filelike): """Handle an ar archive member of the Debian package. + If you replace this method, you must also replace handle_ar_end and + none of the methods handle_debversion, handle_control_tar or + handle_data_tar are called. @type name: bytes @param name: is the name of the member @param filelike: is a file-like object containing the contents of the member and can be read once without seeks. """ + if self.arstate == "start": + if name != b"debian-binary": + raise ValueError("debian-binary not found") + version = filelike.read() + self.handle_debversion(version) + if not version.startswith(b"2."): + raise ValueError("debian version not recognized") + self.arstate = "version" + elif self.arstate == "version": + if name.startswith(b"control.tar"): + filelike = decompress(filelike, name[11:].decode("ascii")) + self.handle_control_tar(opentar(filelike)) + self.arstate = "control" + elif not name.startswith(b"_"): + raise ValueError("unexpected ar member %r" % name) + elif self.arstate == "control": + if name.startswith(b"data.tar"): + filelike = decompress(filelike, name[8:].decode("ascii")) + self.handle_data_tar(opentar(filelike)) + self.arstate = "data" + elif not name.startswith(b"_"): + raise ValueError("unexpected ar member %r" % name) + else: + assert self.arstate == "data" def handle_ar_end(self): "Handle the end of the ar archive of the Debian package." + if self.arstate != "data": + raise ValueError("data.tar not found") + + def handle_debversion(self, version): + """Handle the debian-binary member of the Debian package. 
+ @type version: bytes + @param version: The full contents of the ar member. + """ + + def handle_control_tar(self, tarfileobj): + """Handle the control.tar member of the Debian package. + @type tarfileobj: tarfile.TarFile + @param tarfile: is opened for streaming reads + """ + + def handle_data_tar(self, tarfileobj): + """Handle the data.tar member of the Debian package. + @type tarfileobj: tarfile.TarFile + @param tarfile: is opened for streaming reads + """ diff --git a/importpkg.py b/importpkg.py index e8cc2fa..933ec80 100755 --- a/importpkg.py +++ b/importpkg.py @@ -8,15 +8,15 @@ And finally a document consisting of the string "commit" is emitted.""" import hashlib import optparse import sys -import tarfile import zlib import yaml -from dedup.debpkg import DebExtractor, process_control, get_tar_hashes +from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes, \ + process_control from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \ HashBlacklistContent -from dedup.compression import GzipDecompressor, decompress +from dedup.compression import GzipDecompressor from dedup.image import GIFHash, PNGHash boring_content = set(("", "\n")) @@ -42,33 +42,6 @@ def gifhash(): hashobj.name = "gif_sha512" return hashobj -if sys.version_info.major >= 3: - def decompress_tar(filelike, extension): - filelike = decompress(filelike, extension.decode("ascii")) - return tarfile.open(fileobj=filelike, mode="r|") - - def decodetarname(name): - """Decoded name of a tarinfo. 
- @raises UnicodeDecodeError: - """ - try: - name.encode("utf8", "strict") - except UnicodeEncodeError as e: - if e.reason == "surrogates not allowed": - name.encode("utf8", "surrogateescape").decode("utf8", "strict") - return name -else: - def decompress_tar(filelike, extension): - filelike = decompress(filelike, extension.decode("ascii")) - return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8", - errors="surrogateescape") - - def decodetarname(name): - """Decoded name of a tarinfo. - @raises UnicodeDecodeError: - """ - return name.decode("utf8") - class ProcessingFinished(Exception): pass @@ -76,40 +49,27 @@ class ImportpkgExtractor(DebExtractor): hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash] def __init__(self, callback): - self.state = "start" + DebExtractor.__init__(self) self.callback = callback - def handle_ar_member(self, name, filelike): - if name.startswith(b"control.tar"): - if self.state != "start": - raise ValueError("unexpected control.tar") - self.state = "control" - tf = decompress_tar(filelike, name[11:]) - for elem in tf: - if elem.name not in ("./control", "control"): - continue - if self.state != "control": - raise ValueError("duplicate control file") - self.state = "control_file" - self.callback(process_control(tf.extractfile(elem).read())) - break - elif name.startswith(b"data.tar"): - if self.state != "control_file": - raise ValueError("missing control file") - self.state = "data" - tf = decompress_tar(filelike, name[8:]) - for name, size, hashes in get_tar_hashes(tf, self.hash_functions): - try: - name = decodetarname(name) - except UnicodeDecodeError: - print("warning: skipping filename with encoding error") - continue # skip files with non-utf8 encoding for now - self.callback(dict(name=name, size=size, hashes=hashes)) - raise ProcessingFinished() - - def handle_ar_end(self): - if self.state != "data": - raise ValueError("data.tar not found") + def handle_control_tar(self, tarfileobj): + for elem in tarfileobj: 
+ if elem.name not in ("./control", "control"): + continue + self.callback(process_control(tarfileobj.extractfile(elem).read())) + return + raise ValueError("missing control file") + + def handle_data_tar(self, tarfileobj): + for name, size, hashes in get_tar_hashes(tarfileobj, + self.hash_functions): + try: + name = decodetarname(name) + except UnicodeDecodeError: + print("warning: skipping filename with encoding error") + continue # skip files with non-utf8 encoding for now + self.callback(dict(name=name, size=size, hashes=hashes)) + raise ProcessingFinished() def main(): parser = optparse.OptionParser() |