summary | refs | log | tree | commit | diff
path: root/importpkg.py
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de> 2016-04-19 22:48:02 +0200
committer: Helmut Grohne <helmut@subdivi.de> 2016-04-19 22:55:37 +0200
commit: 29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381 (patch)
tree: f9b1d158f3e79964998d886dfbb63009a05baf32 /importpkg.py
parent: 0715cc5f94438d58e2fc59c065a0afbd3dbb525a (diff)
download: debian-dedup-29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381.tar.gz
add a class DebExtractor for guiding feature extraction
It is supposed to separate the parsing of Debian packages (understanding how the format works) from the actual feature extraction. Its goal is to simplify writing custom extractors for different feature sets.
Diffstat (limited to 'importpkg.py')
-rwxr-xr-x importpkg.py 58
1 files changed, 33 insertions, 25 deletions
diff --git a/importpkg.py b/importpkg.py
index f72cf03..0798f13 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -14,8 +14,7 @@ import zlib
import lzma
import yaml
-from dedup.arreader import ArReader
-from dedup.debpkg import process_control, get_tar_hashes
+from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
HashBlacklistContent
from dedup.compression import GzipDecompressor, DecompressedStream
@@ -54,42 +53,46 @@ def decompress_tar(filelike, extension):
return tarfile.open(fileobj=filelike,
mode="r|" + extension[1:].decode("ascii"))
-def process_package(filelike, hash_functions, callback):
- af = ArReader(filelike)
- af.read_magic()
- state = "start"
- while True:
- try:
- name = af.read_entry()
- except EOFError:
- raise ValueError("data.tar not found")
+class ProcessingFinished(Exception):
+ pass
+
+class ImportpkgExtractor(DebExtractor):
+ def __init__(self, hash_functions, callback):
+ self.state = "start"
+ self.hash_functions = hash_functions
+ self.callback = callback
+
+ def handle_ar_member(self, name, filelike):
if name.startswith(b"control.tar"):
- if state != "start":
+ if self.state != "start":
raise ValueError("unexpected control.tar")
- state = "control"
- tf = decompress_tar(af, name[11:])
+ self.state = "control"
+ tf = decompress_tar(filelike, name[11:])
for elem in tf:
if elem.name not in ("./control", "control"):
continue
- if state != "control":
+ if self.state != "control":
raise ValueError("duplicate control file")
- state = "control_file"
- callback(process_control(tf.extractfile(elem).read()))
+ self.state = "control_file"
+ self.callback(process_control(tf.extractfile(elem).read()))
break
- continue
elif name.startswith(b"data.tar"):
- if state != "control_file":
+ if self.state != "control_file":
raise ValueError("missing control file")
- state = "data"
- tf = decompress_tar(af, name[8:])
- for name, size, hashes in get_tar_hashes(tf, hash_functions):
+ self.state = "data"
+ tf = decompress_tar(filelike, name[8:])
+ for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
try:
name = name.decode("utf8")
except UnicodeDecodeError:
print("warning: skipping filename with encoding error")
continue # skip files with non-utf8 encoding for now
- callback(dict(name=name, size=size, hashes=hashes))
- break
+ self.callback(dict(name=name, size=size, hashes=hashes))
+ raise ProcessingFinished()
+
+ def handle_ar_end(self):
+ if self.state != "data":
+ raise ValueError("data.tar not found")
def main():
parser = optparse.OptionParser()
@@ -105,7 +108,12 @@ def main():
dumper.open()
if options.hash:
stdin = HashedStream(stdin, hashlib.sha256())
- process_package(stdin, hash_functions, dumper.represent)
+ try:
+ ImportpkgExtractor(hash_functions, dumper.represent).process(stdin)
+ except ProcessingFinished:
+ pass
+ else:
+ raise RuntimeError("unexpected termination of extractor")
if options.hash:
stdin.validate(options.hash)
dumper.represent("commit")