summaryrefslogtreecommitdiff
path: root/dedup
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de> 2016-04-19 22:48:02 +0200
committer: Helmut Grohne <helmut@subdivi.de> 2016-04-19 22:55:37 +0200
commit: 29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381 (patch)
tree: f9b1d158f3e79964998d886dfbb63009a05baf32 /dedup
parent: 0715cc5f94438d58e2fc59c065a0afbd3dbb525a (diff)
download: debian-dedup-29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381.tar.gz
add a class DebExtractor for guiding feature extraction
It is supposed to separate the parsing of Debian packages (understanding how the format works) from the actual feature extraction. Its goal is to simplify writing custom extractors for different feature sets.
Diffstat (limited to 'dedup')
-rw-r--r-- dedup/debpkg.py 31
1 files changed, 31 insertions, 0 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 8f2121b..04773de 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -1,5 +1,6 @@
from debian import deb822
+from dedup.arreader import ArReader
from dedup.hashing import hash_file
def process_control(control_contents):
@@ -53,3 +54,33 @@ def get_tar_hashes(tar, hash_functions):
if hashvalue:
hashes[hashobj.name] = hashvalue
yield (elem.name, elem.size, hashes)
+
+class DebExtractor(object):
+    "Base class for feature extractors; subclasses override the handle_* hooks."
+
+    def process(self, filelike):
+        """Walk a Debian package and dispatch its ar members to the hooks.
+        @param filelike: is a file-like object containing the contents of the
+                         Debian package; it is read once, without seeks.
+        """
+        archive = ArReader(filelike)
+        archive.read_magic()
+        while True:
+            try:
+                member = archive.read_entry()
+            except EOFError:
+                # EOFError from read_entry is treated as the end of the archive.
+                break
+            self.handle_ar_member(member, archive)
+        self.handle_ar_end()
+
+    def handle_ar_member(self, name, filelike):
+        """Hook called by process for each ar member; does nothing here.
+        @type name: bytes
+        @param name: is the name of the member
+        @param filelike: is the file-like object the member is read from
+                         (process passes its ArReader); read once, no seeks.
+        """
+
+    def handle_ar_end(self):
+        "Hook called by process after the last ar member; does nothing here."