diff options
author | Helmut Grohne <helmut@subdivi.de> | 2016-04-19 22:48:02 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2016-04-19 22:55:37 +0200 |
commit | 29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381 (patch) | |
tree | f9b1d158f3e79964998d886dfbb63009a05baf32 /importpkg.py | |
parent | 0715cc5f94438d58e2fc59c065a0afbd3dbb525a (diff) | |
download | debian-dedup-29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381.tar.gz |
add a class DebExtractor for guiding feature extraction
It is supposed to separate the parsing of Debian packages (understanding
how the format works) from the actual feature extraction. Its goal is to
simplify writing custom extractors for different feature sets.
Diffstat (limited to 'importpkg.py')
-rwxr-xr-x | importpkg.py | 58 |
1 files changed, 33 insertions, 25 deletions
```diff
diff --git a/importpkg.py b/importpkg.py
index f72cf03..0798f13 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -14,8 +14,7 @@ import zlib
 import lzma
 import yaml
 
-from dedup.arreader import ArReader
-from dedup.debpkg import process_control, get_tar_hashes
+from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
 from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
         HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
@@ -54,42 +53,46 @@ def decompress_tar(filelike, extension):
     return tarfile.open(fileobj=filelike,
                         mode="r|" + extension[1:].decode("ascii"))
 
-def process_package(filelike, hash_functions, callback):
-    af = ArReader(filelike)
-    af.read_magic()
-    state = "start"
-    while True:
-        try:
-            name = af.read_entry()
-        except EOFError:
-            raise ValueError("data.tar not found")
+class ProcessingFinished(Exception):
+    pass
+
+class ImportpkgExtractor(DebExtractor):
+    def __init__(self, hash_functions, callback):
+        self.state = "start"
+        self.hash_functions = hash_functions
+        self.callback = callback
+
+    def handle_ar_member(self, name, filelike):
         if name.startswith(b"control.tar"):
-            if state != "start":
+            if self.state != "start":
                 raise ValueError("unexpected control.tar")
-            state = "control"
-            tf = decompress_tar(af, name[11:])
+            self.state = "control"
+            tf = decompress_tar(filelike, name[11:])
             for elem in tf:
                 if elem.name not in ("./control", "control"):
                     continue
-                if state != "control":
+                if self.state != "control":
                     raise ValueError("duplicate control file")
-                state = "control_file"
-                callback(process_control(tf.extractfile(elem).read()))
+                self.state = "control_file"
+                self.callback(process_control(tf.extractfile(elem).read()))
                 break
-            continue
         elif name.startswith(b"data.tar"):
-            if state != "control_file":
+            if self.state != "control_file":
                 raise ValueError("missing control file")
-            state = "data"
-            tf = decompress_tar(af, name[8:])
-            for name, size, hashes in get_tar_hashes(tf, hash_functions):
+            self.state = "data"
+            tf = decompress_tar(filelike, name[8:])
+            for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
                 try:
                     name = name.decode("utf8")
                 except UnicodeDecodeError:
                     print("warning: skipping filename with encoding error")
                     continue # skip files with non-utf8 encoding for now
-                callback(dict(name=name, size=size, hashes=hashes))
-            break
+                self.callback(dict(name=name, size=size, hashes=hashes))
+            raise ProcessingFinished()
+
+    def handle_ar_end(self):
+        if self.state != "data":
+            raise ValueError("data.tar not found")
 
 def main():
     parser = optparse.OptionParser()
@@ -105,7 +108,12 @@ def main():
     dumper.open()
     if options.hash:
         stdin = HashedStream(stdin, hashlib.sha256())
-    process_package(stdin, hash_functions, dumper.represent)
+    try:
+        ImportpkgExtractor(hash_functions, dumper.represent).process(stdin)
+    except ProcessingFinished:
+        pass
+    else:
+        raise RuntimeError("unexpected termination of extractor")
     if options.hash:
         stdin.validate(options.hash)
     dumper.represent("commit")
```