diff options
author | Helmut Grohne <helmut@subdivi.de> | 2016-05-01 14:31:56 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2016-05-01 14:31:56 +0200 |
commit | e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e (patch) | |
tree | 9b54b6992e466792380f95122415c277f184cbc1 /dedup/debpkg.py | |
parent | 2aef917cabba4e660f2525a551368532dbc7c195 (diff) | |
download | debian-dedup-e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e.tar.gz |
push more functionality into DebExtractor
The handle_ar_member and handle_ar_end methods now have default
implementations, which dispatch to the new handlers handle_debversion,
handle_control_tar and handle_data_tar.
In that process two additional bugs were fixed:
* decompress_tar was wrongly passing errors="surrogateescape" for
Python 2.x even though that's only supported for Python 3.x.
* The call to decompress now actually passes the extension as unicode.
Diffstat (limited to 'dedup/debpkg.py')
-rw-r--r-- | dedup/debpkg.py | 79 |
1 files changed, 79 insertions, 0 deletions
```diff
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 04773de..ba0b7c9 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,6 +1,10 @@
+import sys
+import tarfile
+
 from debian import deb822
 
 from dedup.arreader import ArReader
+from dedup.compression import decompress
 from dedup.hashing import hash_file
 
 def process_control(control_contents):
@@ -55,9 +59,37 @@ def get_tar_hashes(tar, hash_functions):
                 hashes[hashobj.name] = hashvalue
         yield (elem.name, elem.size, hashes)
 
+if sys.version_info.major >= 3:
+    def opentar(filelike):
+        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+                            errors="surrogateescape")
+
+    def decodetarname(name):
+        """Decoded name of a tarinfo.
+        @raises UnicodeDecodeError:
+        """
+        try:
+            name.encode("utf8", "strict")
+        except UnicodeEncodeError as e:
+            if e.reason == "surrogates not allowed":
+                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
+        return name
+else:
+    def opentar(filelike):
+        return tarfile.open(fileobj=filelike, mode="r|")
+
+    def decodetarname(name):
+        """Decoded name of a tarinfo.
+        @raises UnicodeDecodeError:
+        """
+        return name.decode("utf8")
+
 class DebExtractor(object):
     "Base class for extracting desired features from a Debian package."
 
+    def __init__(self):
+        self.arstate = "start"
+
     def process(self, filelike):
         """Process a Debian package.
         @param filelike: is a file-like object containing the contents of the
@@ -76,11 +108,58 @@ class DebExtractor(object):
 
     def handle_ar_member(self, name, filelike):
         """Handle an ar archive member of the Debian package.
+        If you replace this method, you must also replace handle_ar_end and
+        none of the methods handle_debversion, handle_control_tar or
+        handle_data_tar are called.
         @type name: bytes
         @param name: is the name of the member
         @param filelike: is a file-like object containing the contents of the
                          member and can be read once without seeks.
         """
+        if self.arstate == "start":
+            if name != b"debian-binary":
+                raise ValueError("debian-binary not found")
+            version = filelike.read()
+            self.handle_debversion(version)
+            if not version.startswith(b"2."):
+                raise ValueError("debian version not recognized")
+            self.arstate = "version"
+        elif self.arstate == "version":
+            if name.startswith(b"control.tar"):
+                filelike = decompress(filelike, name[11:].decode("ascii"))
+                self.handle_control_tar(opentar(filelike))
+                self.arstate = "control"
+            elif not name.startswith(b"_"):
+                raise ValueError("unexpected ar member %r" % name)
+        elif self.arstate == "control":
+            if name.startswith(b"data.tar"):
+                filelike = decompress(filelike, name[8:].decode("ascii"))
+                self.handle_data_tar(opentar(filelike))
+                self.arstate = "data"
+            elif not name.startswith(b"_"):
+                raise ValueError("unexpected ar member %r" % name)
+        else:
+            assert self.arstate == "data"
 
     def handle_ar_end(self):
         "Handle the end of the ar archive of the Debian package."
+        if self.arstate != "data":
+            raise ValueError("data.tar not found")
+
+    def handle_debversion(self, version):
+        """Handle the debian-binary member of the Debian package.
+        @type version: bytes
+        @param version: The full contents of the ar member.
+        """
+
+    def handle_control_tar(self, tarfileobj):
+        """Handle the control.tar member of the Debian package.
+        @type tarfileobj: tarfile.TarFile
+        @param tarfile: is opened for streaming reads
+        """
+
+    def handle_data_tar(self, tarfileobj):
+        """Handle the data.tar member of the Debian package.
+        @type tarfileobj: tarfile.TarFile
+        @param tarfile: is opened for streaming reads
+        """
```