summaryrefslogtreecommitdiff
path: root/dedup/debpkg.py
blob: d8cc22f5cec80e65bfb51462ab57193963138bb3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
from debian import deb822

from dedup.hashing import hash_file

def process_control(control_contents):
    """Extract the fields relevant to dedup from the control file found in
    the control.tar.gz of a Debian package. Every string stored in the result
    has been passed through .encode("ascii").
    @type control_contents: bytes
    @rtype: {str: object}
    @returns: a dict with the keys "package", "source", "version",
            "architecture" and "depends", the latter being a set of names
    """
    parsed = deb822.Packages(control_contents)

    def ascii_field(key):
        # Look up one field and encode it; a missing field raises KeyError.
        return parsed[key].encode("ascii")

    package = ascii_field("package")
    try:
        raw_source = parsed["source"]
    except KeyError:
        # Without a source field the package name doubles as the source.
        source = package
    else:
        # Only the first whitespace-separated token of the field is kept.
        source = raw_source.encode("ascii").split()[0]
    version = ascii_field("version")
    architecture = ascii_field("architecture")

    # Only relation entries consisting of exactly one element are recorded
    # (presumably those without alternatives - verify against deb822).
    depends = set()
    for relation in parsed.relations.get("depends", ()):
        if len(relation) != 1:
            continue
        depends.add(relation[0]["name"].encode("ascii"))

    return {
        "package": package,
        "source": source,
        "version": version,
        "architecture": architecture,
        "depends": depends,
    }

class MultiHash(object):
    """Bundles several objects offering an update method so that they can be
    fed with the same data through a single update call.
    """

    def __init__(self, *hashes):
        """
        @param hashes: the objects to forward update calls to; they are kept
                as a tuple in the hashes attribute
        """
        self.hashes = hashes

    def update(self, data):
        """Pass data to the update method of each wrapped object in the order
        they were given to the constructor.
        """
        for wrapped in self.hashes:
            wrapped.update(data)

def get_tar_hashes(tar, hash_functions):
    """Iterate over a TarFile and compute every given hash function on each
    regular file contained in it. Other members are skipped.
    @type tar: tarfile.TarFile
    @param hash_functions: a sequence of parameter-less functions each creating a
            new hashlib-like object
    @rtype: gen((str, int, {str: str}))
    @returns: an iterable of (filename, filesize, hashes) tuples where
            hashes is a dict mapping hash function names to hash values;
            hash objects yielding an empty hexdigest are left out of the dict
    """

    for member in tar:
        if member.isreg(): # false for hard links, so those are skipped too
            # Fresh hash objects are created for every file.
            combined = MultiHash(*(create() for create in hash_functions))
            # The return value of hash_file is used from here on; it is
            # expected to expose the hash objects via a hashes attribute.
            combined = hash_file(combined, tar.extractfile(member))
            digests = ((hashobj, hashobj.hexdigest())
                       for hashobj in combined.hashes)
            hashes = {hashobj.name: hexvalue
                      for hashobj, hexvalue in digests
                      if hexvalue}
            yield (member.name, member.size, hashes)