From 29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Tue, 19 Apr 2016 22:48:02 +0200
Subject: add a class DebExtractor for guiding feature extraction

It is supposed to separate the parsing of Debian packages (understanding
how the format works) from the actual feature extraction. Its goal is to
simplify writing custom extractors for different feature sets.
---
 dedup/debpkg.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

(limited to 'dedup')

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 8f2121b..04773de 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,5 +1,6 @@
 from debian import deb822
 
+from dedup.arreader import ArReader
 from dedup.hashing import hash_file
 
 def process_control(control_contents):
@@ -53,3 +54,33 @@ def get_tar_hashes(tar, hash_functions):
             if hashvalue:
                 hashes[hashobj.name] = hashvalue
         yield (elem.name, elem.size, hashes)
+
+class DebExtractor(object):
+    "Base class for extracting desired features from a Debian package."
+
+    def process(self, filelike):
+        """Process a Debian package.
+        @param filelike: is a file-like object containing the contents of the
+                         Debian package and can be read once without seeks.
+        """
+        af = ArReader(filelike)
+        af.read_magic()
+        while True:
+            try:
+                name = af.read_entry()
+            except EOFError:
+                break
+            else:
+                self.handle_ar_member(name, af)
+        self.handle_ar_end()
+
+    def handle_ar_member(self, name, filelike):
+        """Handle an ar archive member of the Debian package.
+        @type name: bytes
+        @param name: is the name of the member
+        @param filelike: is a file-like object containing the contents of the
+                         member and can be read once without seeks.
+        """
+
+    def handle_ar_end(self):
+        "Handle the end of the ar archive of the Debian package."
-- 
cgit v1.2.3