From 8e326cadab47896f77666b0a4c7f434cdefc83da Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Thu, 28 Apr 2016 21:35:42 +0200
Subject: support Python 3.x in importpkg

In Python 2.x, TarInfo.name is a bytes object. In Python 3.x, TarInfo.name
always is a unicode object. To avoid importpkg crashing with an exception,
we direct the Python 3.x decoding to use UTF-8 with the surrogateescape
error handler, so bytes that are not valid UTF-8 end up as lone surrogates
in the name. Thus decoding the name boils down to checking whether it
contains surrogates.
---
 importpkg.py | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/importpkg.py b/importpkg.py
index dac4bb1..e8cc2fa 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -42,9 +42,32 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
-def decompress_tar(filelike, extension):
-    filelike = decompress(filelike, extension.decode("ascii"))
-    return tarfile.open(fileobj=filelike, mode="r|")
+if sys.version_info.major >= 3:
+    def decompress_tar(filelike, extension):
+        filelike = decompress(filelike, extension.decode("ascii"))
+        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+                            errors="surrogateescape")
+
+    def decodetarname(name):
+        """Decoded name of a tarinfo read via decompress_tar.
+        @raises UnicodeDecodeError: if the name is not valid UTF-8
+        """
+        try:
+            name.encode("utf8", "strict")
+        except UnicodeEncodeError as e:
+            if e.reason == "surrogates not allowed":
+                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
+        return name
+else:
+    def decompress_tar(filelike, extension):
+        filelike = decompress(filelike, extension.decode("ascii"))
+        return tarfile.open(fileobj=filelike, mode="r|")
+
+    def decodetarname(name):
+        """Decoded name of a tarinfo read via decompress_tar.
+        @raises UnicodeDecodeError: if the name is not valid UTF-8
+        """
+        return name.decode("utf8")
 
 class ProcessingFinished(Exception):
     pass
@@ -77,7 +100,7 @@ class ImportpkgExtractor(DebExtractor):
             tf = decompress_tar(filelike, name[8:])
             for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
                 try:
-                    name = name.decode("utf8")
+                    name = decodetarname(name)
                 except UnicodeDecodeError:
                     print("warning: skipping filename with encoding error")
                     continue # skip files with non-utf8 encoding for now
-- 
cgit v1.2.3