summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de>, 2016-05-01 14:31:56 +0200
committer: Helmut Grohne <helmut@subdivi.de>, 2016-05-01 14:31:56 +0200
commit: e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e (patch)
tree: 9b54b6992e466792380f95122415c277f184cbc1
parent: 2aef917cabba4e660f2525a551368532dbc7c195 (diff)
download: debian-dedup-e6bc38edc9d0b9c8a34971ae8bcf0f0d53607f5e.tar.gz
push more functionality into DebExtractor
The handle_ar_member and handle_ar_end methods now have a default implementation, which adds the further handlers handle_debversion, handle_control_tar and handle_data_tar.

In the process, two additional bugs were fixed:
 * decompress_tar wrongly passed errors="surrogateescape" on Python 2.x, even though that is only supported on Python 3.x.
 * Callers of decompress actually pass the extension as unicode, so the decompressors table and the documented parameter type now use unicode as well.
-rw-r--r--  dedup/compression.py  10
-rw-r--r--  dedup/debpkg.py       79
-rwxr-xr-x  importpkg.py          84
3 files changed, 106 insertions, 67 deletions
diff --git a/dedup/compression.py b/dedup/compression.py
index 5df6613..7f6dc99 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -156,10 +156,10 @@ class DecompressedStream(object):
self.closed = True
decompressors = {
- '.gz': GzipDecompressor,
- '.bz2': bz2.BZ2Decompressor,
- '.lzma': lzma.LZMADecompressor,
- '.xz': lzma.LZMADecompressor,
+ u'.gz': GzipDecompressor,
+ u'.bz2': bz2.BZ2Decompressor,
+ u'.lzma': lzma.LZMADecompressor,
+ u'.xz': lzma.LZMADecompressor,
}
def decompress(filelike, extension):
@@ -168,7 +168,7 @@ def decompress(filelike, extension):
close().
@param extension: permitted values are "", ".gz", ".bz2", ".lzma", and
".xz"
- @type extension: str
+ @type extension: unicode
@returns: a read-only byte-stream with the decompressed contents of the
original filelike. It supports read(size) and close(). If the
original supports seek(pos) and tell(), then it also supports
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 04773de..ba0b7c9 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,6 +1,10 @@
+import sys
+import tarfile
+
from debian import deb822
from dedup.arreader import ArReader
+from dedup.compression import decompress
from dedup.hashing import hash_file
def process_control(control_contents):
@@ -55,9 +59,37 @@ def get_tar_hashes(tar, hash_functions):
hashes[hashobj.name] = hashvalue
yield (elem.name, elem.size, hashes)
+if sys.version_info.major >= 3:
+ def opentar(filelike):
+ return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
+ errors="surrogateescape")
+
+ def decodetarname(name):
+ """Decoded name of a tarinfo.
+ @raises UnicodeDecodeError:
+ """
+ try:
+ name.encode("utf8", "strict")
+ except UnicodeEncodeError as e:
+ if e.reason == "surrogates not allowed":
+ name.encode("utf8", "surrogateescape").decode("utf8", "strict")
+ return name
+else:
+ def opentar(filelike):
+ return tarfile.open(fileobj=filelike, mode="r|")
+
+ def decodetarname(name):
+ """Decoded name of a tarinfo.
+ @raises UnicodeDecodeError:
+ """
+ return name.decode("utf8")
+
class DebExtractor(object):
"Base class for extracting desired features from a Debian package."
+ def __init__(self):
+ self.arstate = "start"
+
def process(self, filelike):
"""Process a Debian package.
@param filelike: is a file-like object containing the contents of the
@@ -76,11 +108,58 @@ class DebExtractor(object):
def handle_ar_member(self, name, filelike):
"""Handle an ar archive member of the Debian package.
+ If you replace this method, you must also replace handle_ar_end and
+ none of the methods handle_debversion, handle_control_tar or
+ handle_data_tar are called.
@type name: bytes
@param name: is the name of the member
@param filelike: is a file-like object containing the contents of the
member and can be read once without seeks.
"""
+ if self.arstate == "start":
+ if name != b"debian-binary":
+ raise ValueError("debian-binary not found")
+ version = filelike.read()
+ self.handle_debversion(version)
+ if not version.startswith(b"2."):
+ raise ValueError("debian version not recognized")
+ self.arstate = "version"
+ elif self.arstate == "version":
+ if name.startswith(b"control.tar"):
+ filelike = decompress(filelike, name[11:].decode("ascii"))
+ self.handle_control_tar(opentar(filelike))
+ self.arstate = "control"
+ elif not name.startswith(b"_"):
+ raise ValueError("unexpected ar member %r" % name)
+ elif self.arstate == "control":
+ if name.startswith(b"data.tar"):
+ filelike = decompress(filelike, name[8:].decode("ascii"))
+ self.handle_data_tar(opentar(filelike))
+ self.arstate = "data"
+ elif not name.startswith(b"_"):
+ raise ValueError("unexpected ar member %r" % name)
+ else:
+ assert self.arstate == "data"
def handle_ar_end(self):
"Handle the end of the ar archive of the Debian package."
+ if self.arstate != "data":
+ raise ValueError("data.tar not found")
+
+ def handle_debversion(self, version):
+ """Handle the debian-binary member of the Debian package.
+ @type version: bytes
+ @param version: The full contents of the ar member.
+ """
+
+ def handle_control_tar(self, tarfileobj):
+ """Handle the control.tar member of the Debian package.
+ @type tarfileobj: tarfile.TarFile
+ @param tarfile: is opened for streaming reads
+ """
+
+ def handle_data_tar(self, tarfileobj):
+ """Handle the data.tar member of the Debian package.
+ @type tarfileobj: tarfile.TarFile
+ @param tarfile: is opened for streaming reads
+ """
diff --git a/importpkg.py b/importpkg.py
index e8cc2fa..933ec80 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -8,15 +8,15 @@ And finally a document consisting of the string "commit" is emitted."""
import hashlib
import optparse
import sys
-import tarfile
import zlib
import yaml
-from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
+from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes, \
+ process_control
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
HashBlacklistContent
-from dedup.compression import GzipDecompressor, decompress
+from dedup.compression import GzipDecompressor
from dedup.image import GIFHash, PNGHash
boring_content = set(("", "\n"))
@@ -42,33 +42,6 @@ def gifhash():
hashobj.name = "gif_sha512"
return hashobj
-if sys.version_info.major >= 3:
- def decompress_tar(filelike, extension):
- filelike = decompress(filelike, extension.decode("ascii"))
- return tarfile.open(fileobj=filelike, mode="r|")
-
- def decodetarname(name):
- """Decoded name of a tarinfo.
- @raises UnicodeDecodeError:
- """
- try:
- name.encode("utf8", "strict")
- except UnicodeEncodeError as e:
- if e.reason == "surrogates not allowed":
- name.encode("utf8", "surrogateescape").decode("utf8", "strict")
- return name
-else:
- def decompress_tar(filelike, extension):
- filelike = decompress(filelike, extension.decode("ascii"))
- return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
- errors="surrogateescape")
-
- def decodetarname(name):
- """Decoded name of a tarinfo.
- @raises UnicodeDecodeError:
- """
- return name.decode("utf8")
-
class ProcessingFinished(Exception):
pass
@@ -76,40 +49,27 @@ class ImportpkgExtractor(DebExtractor):
hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
def __init__(self, callback):
- self.state = "start"
+ DebExtractor.__init__(self)
self.callback = callback
- def handle_ar_member(self, name, filelike):
- if name.startswith(b"control.tar"):
- if self.state != "start":
- raise ValueError("unexpected control.tar")
- self.state = "control"
- tf = decompress_tar(filelike, name[11:])
- for elem in tf:
- if elem.name not in ("./control", "control"):
- continue
- if self.state != "control":
- raise ValueError("duplicate control file")
- self.state = "control_file"
- self.callback(process_control(tf.extractfile(elem).read()))
- break
- elif name.startswith(b"data.tar"):
- if self.state != "control_file":
- raise ValueError("missing control file")
- self.state = "data"
- tf = decompress_tar(filelike, name[8:])
- for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
- try:
- name = decodetarname(name)
- except UnicodeDecodeError:
- print("warning: skipping filename with encoding error")
- continue # skip files with non-utf8 encoding for now
- self.callback(dict(name=name, size=size, hashes=hashes))
- raise ProcessingFinished()
-
- def handle_ar_end(self):
- if self.state != "data":
- raise ValueError("data.tar not found")
+ def handle_control_tar(self, tarfileobj):
+ for elem in tarfileobj:
+ if elem.name not in ("./control", "control"):
+ continue
+ self.callback(process_control(tarfileobj.extractfile(elem).read()))
+ return
+ raise ValueError("missing control file")
+
+ def handle_data_tar(self, tarfileobj):
+ for name, size, hashes in get_tar_hashes(tarfileobj,
+ self.hash_functions):
+ try:
+ name = decodetarname(name)
+ except UnicodeDecodeError:
+ print("warning: skipping filename with encoding error")
+ continue # skip files with non-utf8 encoding for now
+ self.callback(dict(name=name, size=size, hashes=hashes))
+ raise ProcessingFinished()
def main():
parser = optparse.OptionParser()