summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--dedup/debpkg.py31
-rwxr-xr-ximportpkg.py58
2 files changed, 64 insertions, 25 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 8f2121b..04773de 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,5 +1,6 @@
from debian import deb822
+from dedup.arreader import ArReader
from dedup.hashing import hash_file
def process_control(control_contents):
@@ -53,3 +54,33 @@ def get_tar_hashes(tar, hash_functions):
if hashvalue:
hashes[hashobj.name] = hashvalue
yield (elem.name, elem.size, hashes)
+
+class DebExtractor(object):
+ "Base class for extracting desired features from a Debian package."
+
+ def process(self, filelike):
+ """Process a Debian package.
+ @param filelike: is a file-like object containing the contents of the
+ Debian package and can be read once without seeks.
+ """
+ af = ArReader(filelike)
+ af.read_magic()
+ while True:
+ try:
+ name = af.read_entry()
+ except EOFError:
+ break
+ else:
+ self.handle_ar_member(name, af)
+ self.handle_ar_end()
+
+ def handle_ar_member(self, name, filelike):
+ """Handle an ar archive member of the Debian package.
+ @type name: bytes
+ @param name: is the name of the member
+ @param filelike: is a file-like object containing the contents of the
+ member and can be read once without seeks.
+ """
+
+ def handle_ar_end(self):
+ "Handle the end of the ar archive of the Debian package."
diff --git a/importpkg.py b/importpkg.py
index f72cf03..0798f13 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -14,8 +14,7 @@ import zlib
import lzma
import yaml
-from dedup.arreader import ArReader
-from dedup.debpkg import process_control, get_tar_hashes
+from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
HashBlacklistContent
from dedup.compression import GzipDecompressor, DecompressedStream
@@ -54,42 +53,46 @@ def decompress_tar(filelike, extension):
return tarfile.open(fileobj=filelike,
mode="r|" + extension[1:].decode("ascii"))
-def process_package(filelike, hash_functions, callback):
- af = ArReader(filelike)
- af.read_magic()
- state = "start"
- while True:
- try:
- name = af.read_entry()
- except EOFError:
- raise ValueError("data.tar not found")
+class ProcessingFinished(Exception):
+ pass
+
+class ImportpkgExtractor(DebExtractor):
+ def __init__(self, hash_functions, callback):
+ self.state = "start"
+ self.hash_functions = hash_functions
+ self.callback = callback
+
+ def handle_ar_member(self, name, filelike):
if name.startswith(b"control.tar"):
- if state != "start":
+ if self.state != "start":
raise ValueError("unexpected control.tar")
- state = "control"
- tf = decompress_tar(af, name[11:])
+ self.state = "control"
+ tf = decompress_tar(filelike, name[11:])
for elem in tf:
if elem.name not in ("./control", "control"):
continue
- if state != "control":
+ if self.state != "control":
raise ValueError("duplicate control file")
- state = "control_file"
- callback(process_control(tf.extractfile(elem).read()))
+ self.state = "control_file"
+ self.callback(process_control(tf.extractfile(elem).read()))
break
- continue
elif name.startswith(b"data.tar"):
- if state != "control_file":
+ if self.state != "control_file":
raise ValueError("missing control file")
- state = "data"
- tf = decompress_tar(af, name[8:])
- for name, size, hashes in get_tar_hashes(tf, hash_functions):
+ self.state = "data"
+ tf = decompress_tar(filelike, name[8:])
+ for name, size, hashes in get_tar_hashes(tf, self.hash_functions):
try:
name = name.decode("utf8")
except UnicodeDecodeError:
print("warning: skipping filename with encoding error")
continue # skip files with non-utf8 encoding for now
- callback(dict(name=name, size=size, hashes=hashes))
- break
+ self.callback(dict(name=name, size=size, hashes=hashes))
+ raise ProcessingFinished()
+
+ def handle_ar_end(self):
+ if self.state != "data":
+ raise ValueError("data.tar not found")
def main():
parser = optparse.OptionParser()
@@ -105,7 +108,12 @@ def main():
dumper.open()
if options.hash:
stdin = HashedStream(stdin, hashlib.sha256())
- process_package(stdin, hash_functions, dumper.represent)
+ try:
+ ImportpkgExtractor(hash_functions, dumper.represent).process(stdin)
+ except ProcessingFinished:
+ pass
+ else:
+ raise RuntimeError("unexpected termination of extractor")
if options.hash:
stdin.validate(options.hash)
dumper.represent("commit")