summaryrefslogtreecommitdiff
path: root/dedup/debpkg.py
diff options
context:
space:
mode:
author Helmut Grohne <helmut@subdivi.de> 2014-03-08 12:39:32 +0100
committer Helmut Grohne <helmut@subdivi.de> 2014-03-08 12:39:32 +0100
commit c6a30cefff55cd247a47fa0a2d4f819592e1202b (patch)
tree 58b6ff52bc6827782c2973f1ce976e245ce5f34c /dedup/debpkg.py
parent 751f19ec1107c9059ae4834e4b757741ebee6cbd (diff)
parent bb0aea9971bc79d8787d8f034022d0ca803fcab3 (diff)
download debian-dedup-c6a30cefff55cd247a47fa0a2d4f819592e1202b.tar.gz
Merge branch 'master' into sqlalchemy
In the meantime, the master branch evolved quite a bit and the schema changed again (eqclass added to function table). The main reason for the merge is to resolve the large number of conflicts once, so development of the sqlalchemy branch can continue and still benefit from changes in the master branch, such as schema compatibility and adapting the indent level in the web app due to the use of contextlib.closing, which resembles sqlalchemy's "with db.begin() as conn:". Conflicts: autoimport.py dedup/utils.py readyaml.py update_sharing.py webapp.py
Diffstat (limited to 'dedup/debpkg.py')
-rw-r--r--dedup/debpkg.py55
1 files changed, 55 insertions, 0 deletions
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
new file mode 100644
index 0000000..2d67135
--- /dev/null
+++ b/dedup/debpkg.py
@@ -0,0 +1,55 @@
+from debian import deb822
+
+from dedup.hashing import hash_file
+
+def process_control(control_contents):
+    """Parses the contents of a control file from a control.tar.gz of a Debian
+    package and returns a dictionary containing the fields relevant to dedup.
+
+    The package, source, version and architecture values are encoded to
+    ASCII. If the control file has no Source field, the package name is used
+    as the source instead. Of a Source field only the first
+    whitespace-separated word is kept. A missing Package, Version or
+    Architecture field is not handled here.
+
+    @type control_contents: bytes
+    @rtype: {str: object}
+    @returns: a dict with the keys "package", "source", "version",
+        "architecture" and "depends", where depends is a set of names
+    """
+    control = deb822.Packages(control_contents)
+    package = control["package"].encode("ascii")
+    try:
+        source = control["source"].encode("ascii").split()[0]
+    except KeyError:
+        source = package
+    version = control["version"].encode("ascii")
+    architecture = control["architecture"].encode("ascii")
+    # deb822 currently returns :any dependencies raw. see #670679
+    # Hence everything from the first ':' on is cut off each name. Only
+    # entries of length one are used (presumably dependencies without
+    # alternatives -- confirm against the deb822 relations format).
+    depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
+                  for dep in control.relations.get("depends", ())
+                  if len(dep) == 1)
+    return dict(package=package, source=source, version=version,
+                architecture=architecture, depends=depends)
+
+class MultiHash(object):
+    """Feeds the same data to several hash objects at once.
+
+    The wrapped objects are stored, in the order given, in the hashes
+    attribute so that callers can query each of them individually.
+    """
+    def __init__(self, *hashes):
+        """
+        @param hashes: objects providing an update(data) method
+        """
+        self.hashes = hashes
+
+    def update(self, data):
+        """Passes data on to the update method of every wrapped object."""
+        for hasher in self.hashes:
+            hasher.update(data)
+
+def get_tar_hashes(tar, hash_functions):
+    """Given a TarFile read all regular files and compute all of the given hash
+    functions on each file.
+
+    @type tar: tarfile.TarFile
+    @param hash_functions: a sequence of parameter-less functions each creating a
+        new hashlib-like object
+    @rtype: gen((str, int, {str: str}))
+    @returns: an iterable of (filename, filesize, hashes) tuples where
+        hashes is a dict mapping hash function names to hash values. A hash
+        object whose hexdigest() is empty or None gets no entry in the dict.
+    """
+
+    for elem in tar:
+        if not elem.isreg(): # excludes hard links as well
+            continue
+        # Every member gets its own freshly created set of hash objects.
+        hasher = MultiHash(*[func() for func in hash_functions])
+        # hash_file comes from dedup.hashing; its result is used in place of
+        # the hasher passed in (presumably the same object -- confirm there).
+        hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
+        for hashobj in hasher.hashes:
+            hashvalue = hashobj.hexdigest()
+            if hashvalue:
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)