diff options
author | Helmut Grohne <helmut@subdivi.de> | 2016-04-19 22:48:02 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2016-04-19 22:55:37 +0200 |
commit | 29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381 (patch) | |
tree | f9b1d158f3e79964998d886dfbb63009a05baf32 /dedup/debpkg.py | |
parent | 0715cc5f94438d58e2fc59c065a0afbd3dbb525a (diff) | |
download | debian-dedup-29bdbe1c62acfd2bacac11b17f9b73aa7dbcc381.tar.gz |
add a class DebExtractor for guiding feature extraction
It is supposed to separate the parsing of Debian packages (understanding
how the format works) from the actual feature extraction. Its goal is to
simplify writing custom extractors for different feature sets.
Diffstat (limited to 'dedup/debpkg.py')
-rw-r--r-- | dedup/debpkg.py | 31 |
1 files changed, 31 insertions, 0 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py index 8f2121b..04773de 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -1,5 +1,6 @@ from debian import deb822 +from dedup.arreader import ArReader from dedup.hashing import hash_file def process_control(control_contents): @@ -53,3 +54,33 @@ def get_tar_hashes(tar, hash_functions): if hashvalue: hashes[hashobj.name] = hashvalue yield (elem.name, elem.size, hashes) + +class DebExtractor(object): + "Base class for extracting desired features from a Debian package." + + def process(self, filelike): + """Process a Debian package. + @param filelike: is a file-like object containing the contents of the + Debian package and can be read once without seeks. + """ + af = ArReader(filelike) + af.read_magic() + while True: + try: + name = af.read_entry() + except EOFError: + break + else: + self.handle_ar_member(name, af) + self.handle_ar_end() + + def handle_ar_member(self, name, filelike): + """Handle an ar archive member of the Debian package. + @type name: bytes + @param name: is the name of the member + @param filelike: is a file-like object containing the contents of the + member and can be read once without seeks. + """ + + def handle_ar_end(self): + "Handle the end of the ar archive of the Debian package." |