summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Helmut Grohne <helmut@subdivi.de> 2016-04-28 21:35:42 +0200
committer Helmut Grohne <helmut@subdivi.de> 2016-04-28 21:35:42 +0200
commit 8e326cadab47896f77666b0a4c7f434cdefc83da (patch)
tree 7691ac50e247be2c1db9e70f4379cb3165d9fef5
parent 9ad30297a535a8a8f9bdc945288b02d75a379cb6 (diff)
download debian-dedup-8e326cadab47896f77666b0a4c7f434cdefc83da.tar.gz
support Python 3.x in importpkg
In Python 2.x, TarInfo.name is a bytes object. In Python 3.x, TarInfo.name is always a unicode object. To avoid importpkg crashing with an exception, we direct the Python 3.x decoding to use the surrogateescape error handler. Thus decoding the name boils down to checking whether it contains surrogates.
-rwxr-xr-x importpkg.py 31
1 files changed, 27 insertions, 4 deletions
diff --git a/importpkg.py b/importpkg.py
index dac4bb1..e8cc2fa 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -42,9 +42,32 @@ def gifhash():
hashobj.name = "gif_sha512"
return hashobj
-def decompress_tar(filelike, extension):
- filelike = decompress(filelike, extension.decode("ascii"))
- return tarfile.open(fileobj=filelike, mode="r|")
+if sys.version_info.major >= 3:  # TarInfo.name is a unicode object here
+    def decompress_tar(filelike, extension):
+        filelike = decompress(filelike, extension.decode("ascii"))
+        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+                            errors="surrogateescape")
+
+    def decodetarname(name):
+        """Return the name of a tarinfo unless it hides non-UTF-8 bytes.
+        @raises UnicodeDecodeError: if escaped bytes are not valid UTF-8
+        """
+        try:
+            name.encode("utf8", "strict")
+        except UnicodeEncodeError as e:
+            if e.reason == "surrogates not allowed":
+                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
+        return name
+else:  # Python 2.x: TarInfo.name is a bytes object
+    def decompress_tar(filelike, extension):
+        filelike = decompress(filelike, extension.decode("ascii"))
+        return tarfile.open(fileobj=filelike, mode="r|")
+
+    def decodetarname(name):
+        """Decode the name of a tarinfo as UTF-8.
+        @raises UnicodeDecodeError: if the name is not valid UTF-8
+        """
+        return name.decode("utf8")
class ProcessingFinished(Exception):
pass
@@ -77,7 +100,7 @@ class ImportpkgExtractor(DebExtractor):
tf = decompress_tar(filelike, name[8:])
for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
try:
- name = name.decode("utf8")
+ name = decodetarname(name)
except UnicodeDecodeError:
print("warning: skipping filename with encoding error")
continue # skip files with non-utf8 encoding for now