diff options
Diffstat (limited to 'dedup')
-rw-r--r-- | dedup/compression.py | 10 | ||||
-rw-r--r-- | dedup/debpkg.py | 79 |
2 files changed, 84 insertions, 5 deletions
diff --git a/dedup/compression.py b/dedup/compression.py index 5df6613..7f6dc99 100644 --- a/dedup/compression.py +++ b/dedup/compression.py @@ -156,10 +156,10 @@ class DecompressedStream(object): self.closed = True decompressors = { - '.gz': GzipDecompressor, - '.bz2': bz2.BZ2Decompressor, - '.lzma': lzma.LZMADecompressor, - '.xz': lzma.LZMADecompressor, + u'.gz': GzipDecompressor, + u'.bz2': bz2.BZ2Decompressor, + u'.lzma': lzma.LZMADecompressor, + u'.xz': lzma.LZMADecompressor, } def decompress(filelike, extension): @@ -168,7 +168,7 @@ def decompress(filelike, extension): close(). @param extension: permitted values are "", ".gz", ".bz2", ".lzma", and ".xz" - @type extension: str + @type extension: unicode @returns: a read-only byte-stream with the decompressed contents of the original filelike. It supports read(size) and close(). If the original supports seek(pos) and tell(), then it also supports diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 04773de..ba0b7c9 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -1,6 +1,10 @@ +import sys +import tarfile + from debian import deb822 from dedup.arreader import ArReader +from dedup.compression import decompress from dedup.hashing import hash_file def process_control(control_contents): @@ -55,9 +59,37 @@ def get_tar_hashes(tar, hash_functions): hashes[hashobj.name] = hashvalue yield (elem.name, elem.size, hashes) +if sys.version_info.major >= 3: + def opentar(filelike): + return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8", + errors="surrogateescape") + + def decodetarname(name): + """Decoded name of a tarinfo. + @raises UnicodeDecodeError: + """ + try: + name.encode("utf8", "strict") + except UnicodeEncodeError as e: + if e.reason == "surrogates not allowed": + name.encode("utf8", "surrogateescape").decode("utf8", "strict") + return name +else: + def opentar(filelike): + return tarfile.open(fileobj=filelike, mode="r|") + + def decodetarname(name): + """Decoded name of a tarinfo. + @raises UnicodeDecodeError: + """ + return name.decode("utf8") + class DebExtractor(object): "Base class for extracting desired features from a Debian package." + def __init__(self): + self.arstate = "start" + def process(self, filelike): """Process a Debian package. @param filelike: is a file-like object containing the contents of the @@ -76,11 +108,58 @@ class DebExtractor(object): def handle_ar_member(self, name, filelike): """Handle an ar archive member of the Debian package. + If you replace this method, you must also replace handle_ar_end and + none of the methods handle_debversion, handle_control_tar or + handle_data_tar are called. @type name: bytes @param name: is the name of the member @param filelike: is a file-like object containing the contents of the member and can be read once without seeks. """ + if self.arstate == "start": + if name != b"debian-binary": + raise ValueError("debian-binary not found") + version = filelike.read() + self.handle_debversion(version) + if not version.startswith(b"2."): + raise ValueError("debian version not recognized") + self.arstate = "version" + elif self.arstate == "version": + if name.startswith(b"control.tar"): + filelike = decompress(filelike, name[11:].decode("ascii")) + self.handle_control_tar(opentar(filelike)) + self.arstate = "control" + elif not name.startswith(b"_"): + raise ValueError("unexpected ar member %r" % name) + elif self.arstate == "control": + if name.startswith(b"data.tar"): + filelike = decompress(filelike, name[8:].decode("ascii")) + self.handle_data_tar(opentar(filelike)) + self.arstate = "data" + elif not name.startswith(b"_"): + raise ValueError("unexpected ar member %r" % name) + else: + assert self.arstate == "data" def handle_ar_end(self): "Handle the end of the ar archive of the Debian package." + if self.arstate != "data": + raise ValueError("data.tar not found") + + def handle_debversion(self, version): + """Handle the debian-binary member of the Debian package. + @type version: bytes + @param version: The full contents of the ar member. + """ + + def handle_control_tar(self, tarfileobj): + """Handle the control.tar member of the Debian package. + @type tarfileobj: tarfile.TarFile + @param tarfile: is opened for streaming reads + """ + + def handle_data_tar(self, tarfileobj): + """Handle the data.tar member of the Debian package. + @type tarfileobj: tarfile.TarFile + @param tarfile: is opened for streaming reads + """ |