diff options
-rw-r--r-- | dedup/debpkg.py | 31 | ||||
-rwxr-xr-x | importpkg.py | 58 |
2 files changed, 64 insertions, 25 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 8f2121b..04773de 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,5 +1,6 @@
 from debian import deb822
 
+from dedup.arreader import ArReader
 from dedup.hashing import hash_file
 
 def process_control(control_contents):
@@ -53,3 +54,33 @@ def get_tar_hashes(tar, hash_functions):
             if hashvalue:
                 hashes[hashobj.name] = hashvalue
         yield (elem.name, elem.size, hashes)
+
+class DebExtractor(object):
+    "Base class for extracting desired features from a Debian package."
+
+    def process(self, filelike):
+        """Process a Debian package.
+        @param filelike: is a file-like object containing the contents of the
+                         Debian packge and can be read once without seeks.
+        """
+        af = ArReader(filelike)
+        af.read_magic()
+        while True:
+            try:
+                name = af.read_entry()
+            except EOFError:
+                break
+            else:
+                self.handle_ar_member(name, af)
+        self.handle_ar_end()
+
+    def handle_ar_member(self, name, filelike):
+        """Handle an ar archive member of the Debian package.
+        @type name: bytes
+        @param name: is the name of the member
+        @param filelike: is a file-like object containing the contents of the
+                         member and can be read once without seeks.
+        """
+
+    def handle_ar_end(self):
+        "Handle the end of the ar archive of the Debian package."
diff --git a/importpkg.py b/importpkg.py
index f72cf03..0798f13 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -14,8 +14,7 @@ import zlib
 import lzma
 import yaml
 
-from dedup.arreader import ArReader
-from dedup.debpkg import process_control, get_tar_hashes
+from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
     HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
@@ -54,42 +53,46 @@ def decompress_tar(filelike, extension):
     return tarfile.open(fileobj=filelike,
                         mode="r|" + extension[1:].decode("ascii"))
 
-def process_package(filelike, hash_functions, callback):
-    af = ArReader(filelike)
-    af.read_magic()
-    state = "start"
-    while True:
-        try:
-            name = af.read_entry()
-        except EOFError:
-            raise ValueError("data.tar not found")
+class ProcessingFinished(Exception):
+    pass
+
+class ImportpkgExtractor(DebExtractor):
+    def __init__(self, hash_functions, callback):
+        self.state = "start"
+        self.hash_functions = hash_functions
+        self.callback = callback
+
+    def handle_ar_member(self, name, filelike):
         if name.startswith(b"control.tar"):
-            if state != "start":
+            if self.state != "start":
                 raise ValueError("unexpected control.tar")
-            state = "control"
-            tf = decompress_tar(af, name[11:])
+            self.state = "control"
+            tf = decompress_tar(filelike, name[11:])
             for elem in tf:
                 if elem.name not in ("./control", "control"):
                     continue
-                if state != "control":
+                if self.state != "control":
                     raise ValueError("duplicate control file")
-                state = "control_file"
-                callback(process_control(tf.extractfile(elem).read()))
+                self.state = "control_file"
+                self.callback(process_control(tf.extractfile(elem).read()))
                 break
-            continue
         elif name.startswith(b"data.tar"):
-            if state != "control_file":
+            if self.state != "control_file":
                 raise ValueError("missing control file")
-            state = "data"
-            tf = decompress_tar(af, name[8:])
-            for name, size, hashes in get_tar_hashes(tf, hash_functions):
+            self.state = "data"
+            tf = decompress_tar(filelike, name[8:])
+            for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
                 try:
                     name = name.decode("utf8")
                 except UnicodeDecodeError:
                     print("warning: skipping filename with encoding error")
                     continue # skip files with non-utf8 encoding for now
-                callback(dict(name=name, size=size, hashes=hashes))
-            break
+                self.callback(dict(name=name, size=size, hashes=hashes))
+            raise ProcessingFinished()
+
+    def handle_ar_end(self):
+        if self.state != "data":
+            raise ValueError("data.tar not found")
 
 def main():
     parser = optparse.OptionParser()
@@ -105,7 +108,12 @@ def main():
     dumper.open()
     if options.hash:
         stdin = HashedStream(stdin, hashlib.sha256())
-    process_package(stdin, hash_functions, dumper.represent)
+    try:
+        ImportpkgExtractor(hash_functions, dumper.represent).process(stdin)
+    except ProcessingFinished:
+        pass
+    else:
+        raise RuntimeError("unexpected termination of extractor")
     if options.hash:
         stdin.validate(options.hash)
     dumper.represent("commit")