summaryrefslogtreecommitdiff
path: root/dedup/debpkg.py
diff options
context:
space:
mode:
author Helmut Grohne <helmut@subdivi.de> 2016-05-01 14:31:56 +0200
committer Helmut Grohne <helmut@subdivi.de> 2016-05-01 14:31:56 +0200
commit e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e (patch)
tree 9b54b6992e466792380f95122415c277f184cbc1 /dedup/debpkg.py
parent 2aef917cabba4e660f2525a551368532dbc7c195 (diff)
download debian-dedup-e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e.tar.gz
push more functionality into DebExtractor
The handle_ar_member and handle_ar_end methods now have default implementations, which introduce the further handlers handle_debversion, handle_control_tar and handle_data_tar. In the process, two additional bugs were fixed:
* decompress_tar wrongly passed errors="surrogateescape" on Python 2.x, even though that argument is only supported on Python 3.x.
* The call to decompress now actually passes the extension as unicode.
Diffstat (limited to 'dedup/debpkg.py')
-rw-r--r-- dedup/debpkg.py 79
1 files changed, 79 insertions, 0 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 04773de..ba0b7c9 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,6 +1,10 @@
+import sys
+import tarfile
+
from debian import deb822
from dedup.arreader import ArReader
+from dedup.compression import decompress
from dedup.hashing import hash_file
def process_control(control_contents):
@@ -55,9 +59,37 @@ def get_tar_hashes(tar, hash_functions):
hashes[hashobj.name] = hashvalue
yield (elem.name, elem.size, hashes)
+if sys.version_info.major >= 3:
+ def opentar(filelike):
+ """Open filelike as a tar stream for sequential reading.
+ Member names are decoded as utf8 with the surrogateescape error
+ handler, so undecodable names do not abort reading the archive.
+ """
+ return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+ errors="surrogateescape")
+
+ def decodetarname(name):
+ """Decoded name of a tarinfo.
+ @param name: a member name as produced by a tarfile from opentar
+ @raises UnicodeDecodeError:
+ """
+ try:
+ name.encode("utf8", "strict")
+ except UnicodeEncodeError as e:
+ if e.reason == "surrogates not allowed":
+ # Turn the escaped name back into bytes and decode those strictly,
+ # so the failure surfaces as the documented UnicodeDecodeError.
+ name.encode("utf8", "surrogateescape").decode("utf8", "strict")
+ return name
+else:
+ def opentar(filelike):
+ """Open filelike as a tar stream for sequential reading.
+ No encoding or errors arguments are passed in this variant.
+ """
+ return tarfile.open(fileobj=filelike, mode="r|")
+
+ def decodetarname(name):
+ """Decoded name of a tarinfo.
+ @param name: a member name as produced by a tarfile from opentar
+ @raises UnicodeDecodeError:
+ """
+ return name.decode("utf8")
+
class DebExtractor(object):
"Base class for extracting desired features from a Debian package."
+ def __init__(self):
+ # Progress of the default handle_ar_member implementation through the
+ # ar members: "start" -> "version" -> "control" -> "data".
+ self.arstate = "start"
+
def process(self, filelike):
"""Process a Debian package.
@param filelike: is a file-like object containing the contents of the
@@ -76,11 +108,58 @@ class DebExtractor(object):
def handle_ar_member(self, name, filelike):
"""Handle an ar archive member of the Debian package.
+ The default implementation expects the members debian-binary,
+ control.tar* and data.tar* in that order and passes them on to
+ handle_debversion, handle_control_tar and handle_data_tar.
+ If you replace this method, you must also replace handle_ar_end, and
+ none of the methods handle_debversion, handle_control_tar or
+ handle_data_tar will be called.
@type name: bytes
@param name: is the name of the member
@param filelike: is a file-like object containing the contents of the
member and can be read once without seeks.
+ @raises ValueError: if the first member is not debian-binary, if its
+ contents do not start with "2." or if an unexpected member shows up
+ before control.tar or data.tar
"""
+ if self.arstate == "start":
+ if name != b"debian-binary":
+ raise ValueError("debian-binary not found")
+ version = filelike.read()
+ # The handler gets to see the version before it is validated.
+ self.handle_debversion(version)
+ if not version.startswith(b"2."):
+ raise ValueError("debian version not recognized")
+ self.arstate = "version"
+ elif self.arstate == "version":
+ if name.startswith(b"control.tar"):
+ # Whatever follows "control.tar" in the member name is handed to
+ # decompress as a text string.
+ filelike = decompress(filelike, name[11:].decode("ascii"))
+ self.handle_control_tar(opentar(filelike))
+ self.arstate = "control"
+ # Members whose name starts with an underscore are skipped.
+ elif not name.startswith(b"_"):
+ raise ValueError("unexpected ar member %r" % name)
+ elif self.arstate == "control":
+ if name.startswith(b"data.tar"):
+ # Whatever follows "data.tar" in the member name is handed to
+ # decompress as a text string.
+ filelike = decompress(filelike, name[8:].decode("ascii"))
+ self.handle_data_tar(opentar(filelike))
+ self.arstate = "data"
+ # Members whose name starts with an underscore are skipped.
+ elif not name.startswith(b"_"):
+ raise ValueError("unexpected ar member %r" % name)
+ else:
+ # Any member following data.tar is ignored.
+ assert self.arstate == "data"
def handle_ar_end(self):
"Handle the end of the ar archive of the Debian package."
+ # The state only becomes "data" after a data.tar member was handled, so
+ # any other state at the end means the package is incomplete.
+ if self.arstate != "data":
+ raise ValueError("data.tar not found")
+
+ def handle_debversion(self, version):
+ """Handle the debian-binary member of the Debian package.
+ The default implementation does nothing. The default
+ handle_ar_member calls this before checking that the version
+ starts with "2.".
+ @type version: bytes
+ @param version: The full contents of the ar member.
+ """
+
+ def handle_control_tar(self, tarfileobj):
+ """Handle the control.tar member of the Debian package.
+ The default implementation does nothing.
+ @type tarfileobj: tarfile.TarFile
+ @param tarfileobj: is opened for streaming reads
+ """
+
+ def handle_data_tar(self, tarfileobj):
+ """Handle the data.tar member of the Debian package.
+ The default implementation does nothing.
+ @type tarfileobj: tarfile.TarFile
+ @param tarfileobj: is opened for streaming reads
+ """