summaryrefslogtreecommitdiff
path: root/dedup/debpkg.py
diff options
context:
space:
mode:
Diffstat (limited to 'dedup/debpkg.py')
-rw-r--r--dedup/debpkg.py55
1 files changed, 55 insertions, 0 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..d8cc22f
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
def process_control(control_contents):
    """Parse the control file of a Debian package for dedup.

    The contents are parsed with deb822.Packages and reduced to a dictionary
    containing the fields relevant to dedup.

    @type control_contents: bytes
    @param control_contents: the contents of a control file from a
        control.tar.gz of a Debian package
    @rtype: {str: object}
    @returns: a dict with the keys "package", "source", "version" and
        "architecture" (ASCII-encoded field values) and "depends" (a set of
        ASCII-encoded package names)
    @raises KeyError: if the package, version or architecture field is
        missing
    """
    control = deb822.Packages(control_contents)
    package = control["package"].encode("ascii")
    try:
        # Only the first whitespace-separated word of the source field is
        # kept; anything following the name is dropped.
        source = control["source"].encode("ascii").split()[0]
    except KeyError:
        # Without a source field the package name doubles as source name.
        source = package
    version = control["version"].encode("ascii")
    architecture = control["architecture"].encode("ascii")

    # Only relations without alternatives (exactly one entry) are recorded.
    # A multiarch qualifier such as ":any" is cut off the name, because some
    # python-debian releases hand such relations back unparsed (reportedly
    # Debian bug #670679 -- confirm against the deployed version). Package
    # names never contain a colon, so this is a no-op for plain names.
    depends = set(dep[0]["name"].split(":", 1)[0].encode("ascii")
                  for dep in control.relations.get("depends", ())
                  if len(dep) == 1)
    return dict(package=package, source=source, version=version,
                architecture=architecture, depends=depends)
+
class MultiHash(object):
    """Feed the same data to several hashlib-like objects at once."""

    def __init__(self, *hashes):
        """Remember the given hash objects.

        @param hashes: any number of objects providing an update method
        """
        # Kept as passed (a tuple), so callers can read the individual
        # objects back from this attribute.
        self.hashes = hashes

    def update(self, data):
        """Forward data to the update method of every wrapped hash object.

        @param data: passed on unchanged to each hash object
        """
        for wrapped in self.hashes:
            wrapped.update(data)
+
def get_tar_hashes(tar, hash_functions):
    """Compute all given hash functions on every regular file of a TarFile.

    @type tar: tarfile.TarFile
    @param hash_functions: a sequence of parameter-less functions each
        creating a new hashlib-like object
    @rtype: gen((str, int, {str: str}))
    @returns: an iterable of (filename, filesize, hashes) tuples where
        hashes is a dict mapping hash function names to hash values
    """
    for member in tar:
        if member.isreg():  # hard links are not regular members either
            # A fresh set of hash objects is created for every file.
            combined = MultiHash(*[create() for create in hash_functions])
            # hash_file's return value replaces the hasher; it is expected
            # to expose the individual hash objects as .hashes.
            combined = hash_file(combined, tar.extractfile(member))
            # Lazily pair each hash object with its digest so that the name
            # is only looked up for digests that are actually kept.
            digests = ((hashobj, hashobj.hexdigest())
                       for hashobj in combined.hashes)
            # Hash objects reporting a false digest (e.g. None or an empty
            # string) are left out of the result.
            hashes = dict((hashobj.name, value)
                          for hashobj, value in digests if value)
            yield (member.name, member.size, hashes)