summaryrefslogtreecommitdiff
path: root/dedup/debpkg.py
blob: cd8616fd35b24e4c0592db5831811a56b7e61697 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from debian import deb822

from dedup.hashing import hash_file

def process_control(control_contents):
    """Parse the control file taken from the control.tar of a Debian package
    and collect the fields that dedup cares about.

    The result always has the keys "package", "source", "version",
    "architecture" and "depends". "source" falls back to the package name
    when the control file has no Source field; otherwise only the first
    whitespace-separated token of that field is kept. "depends" is a set of
    package names taken from those Depends entries that consist of exactly
    one alternative, with anything from the first ':' onwards removed. The
    key "multiarch" is only present if the control file has a Multi-Arch
    field.
    @type control_contents: bytes
    @rtype: {str: object}
    @raises KeyError: if the Package, Version or Architecture field is
            missing
    """
    paragraph = deb822.Packages(control_contents)

    def ascii_field(name):
        # Propagates KeyError when the paragraph lacks the field.
        return paragraph[name].encode("ascii")

    package = ascii_field("package")
    try:
        raw_source = paragraph["source"]
    except KeyError:
        source = package
    else:
        source = raw_source.encode("ascii").split()[0]
    version = ascii_field("version")
    architecture = ascii_field("architecture")

    depends = set()
    for alternatives in paragraph.relations.get("depends", ()):
        # Entries offering several alternatives are ignored.
        if len(alternatives) != 1:
            continue
        # deb822 currently returns :any dependencies raw. see #670679
        bare_name = alternatives[0]["name"].split(u':', 1)[0]
        depends.add(bare_name.encode("ascii"))

    result = {
        "package": package,
        "source": source,
        "version": version,
        "architecture": architecture,
        "depends": depends,
    }
    try:
        raw_multiarch = paragraph["multi-arch"]
    except KeyError:
        return result
    result["multiarch"] = raw_multiarch.encode("ascii")
    return result

class MultiHash(object):
    """Fan a single stream of data out to several hash objects.

    The wrapped objects are kept, in the order given, as the tuple
    C{self.hashes}. Only C{update} is forwarded; results have to be read
    from the individual objects in C{self.hashes}.
    """

    def __init__(self, *hashes):
        """
        @param hashes: any number of objects providing an update(data) method
        """
        self.hashes = hashes

    def update(self, data):
        """Pass data on to the update method of every wrapped hash object.
        @param data: the chunk to feed to each wrapped object
        """
        for member in self.hashes:
            member.update(data)

def get_tar_hashes(tar, hash_functions):
    """Walk a TarFile and, for each regular file in it, compute every one of
    the given hash functions over that file.

    Fresh hash objects are created for each member. A hash object whose
    hexdigest() is empty (or otherwise false) is left out of that member's
    result dict.
    @type tar: tarfile.TarFile
    @param hash_functions: a sequence of parameter-less functions each creating a
            new hashlib-like object (providing update, hexdigest and name)
    @rtype: gen((str, int, {str: str}))
    @returns: an iterable of (filename, filesize, hashes) tuples where
            hashes is a dict mapping hash function names to hash values
    """

    for member in tar:
        # isreg() is false for hard links as well, so they are skipped too.
        if member.isreg():
            combined = MultiHash(*(create() for create in hash_functions))
            # The object returned by hash_file is the one queried for results.
            combined = hash_file(combined, tar.extractfile(member))
            digests = {}
            for hashobj in combined.hashes:
                digest = hashobj.hexdigest()
                if not digest:
                    continue
                digests[hashobj.name] = digest
            yield (member.name, member.size, digests)