From e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 1 May 2016 14:31:56 +0200 Subject: push more functionality into DebExtractor The handle_ar_member and handle_ar_end methods now have a default implementation adding further handlers handle_debversion, handle_control_tar and handle_data_tar. In that process two additional bugs were fixed: * decompress_tar was wrongly passing errors="surrogateescape" for Python 2.x even though that's only supported for Python 3.x. * The use of decompress actually passes the extension as unicode. --- dedup/debpkg.py | 79 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 79 insertions(+) (limited to 'dedup/debpkg.py') diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 04773de..ba0b7c9 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -1,6 +1,10 @@ +import sys +import tarfile + from debian import deb822 from dedup.arreader import ArReader +from dedup.compression import decompress from dedup.hashing import hash_file def process_control(control_contents): @@ -55,9 +59,37 @@ def get_tar_hashes(tar, hash_functions): hashes[hashobj.name] = hashvalue yield (elem.name, elem.size, hashes) +if sys.version_info.major >= 3: + def opentar(filelike): + return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8", + errors="surrogateescape") + + def decodetarname(name): + """Decoded name of a tarinfo. + @raises UnicodeDecodeError: + """ + try: + name.encode("utf8", "strict") + except UnicodeEncodeError as e: + if e.reason == "surrogates not allowed": + name.encode("utf8", "surrogateescape").decode("utf8", "strict") + return name +else: + def opentar(filelike): + return tarfile.open(fileobj=filelike, mode="r|") + + def decodetarname(name): + """Decoded name of a tarinfo. + @raises UnicodeDecodeError: + """ + return name.decode("utf8") + class DebExtractor(object): "Base class for extracting desired features from a Debian package." 
+ def __init__(self): + self.arstate = "start" + def process(self, filelike): """Process a Debian package. @param filelike: is a file-like object containing the contents of the @@ -76,11 +108,58 @@ class DebExtractor(object): def handle_ar_member(self, name, filelike): """Handle an ar archive member of the Debian package. + If you replace this method, you must also replace handle_ar_end and + none of the methods handle_debversion, handle_control_tar or + handle_data_tar are called. @type name: bytes @param name: is the name of the member @param filelike: is a file-like object containing the contents of the member and can be read once without seeks. """ + if self.arstate == "start": + if name != b"debian-binary": + raise ValueError("debian-binary not found") + version = filelike.read() + self.handle_debversion(version) + if not version.startswith(b"2."): + raise ValueError("debian version not recognized") + self.arstate = "version" + elif self.arstate == "version": + if name.startswith(b"control.tar"): + filelike = decompress(filelike, name[11:].decode("ascii")) + self.handle_control_tar(opentar(filelike)) + self.arstate = "control" + elif not name.startswith(b"_"): + raise ValueError("unexpected ar member %r" % name) + elif self.arstate == "control": + if name.startswith(b"data.tar"): + filelike = decompress(filelike, name[8:].decode("ascii")) + self.handle_data_tar(opentar(filelike)) + self.arstate = "data" + elif not name.startswith(b"_"): + raise ValueError("unexpected ar member %r" % name) + else: + assert self.arstate == "data" def handle_ar_end(self): "Handle the end of the ar archive of the Debian package." + if self.arstate != "data": + raise ValueError("data.tar not found") + + def handle_debversion(self, version): + """Handle the debian-binary member of the Debian package. + @type version: bytes + @param version: The full contents of the ar member. 
+ """ + + def handle_control_tar(self, tarfileobj): + """Handle the control.tar member of the Debian package. + @type tarfileobj: tarfile.TarFile + @param tarfileobj: is opened for streaming reads + """ + + def handle_data_tar(self, tarfileobj): + """Handle the data.tar member of the Debian package. + @type tarfileobj: tarfile.TarFile + @param tarfileobj: is opened for streaming reads + """ -- cgit v1.2.3