summaryrefslogtreecommitdiff
path: root/importpkg.py
blob: dac4bb1d8bc167237e19f1f1299043ff5b1635c7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
#!/usr/bin/python
"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout.  It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""

import hashlib
import optparse
import sys
import tarfile
import zlib

import yaml

from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
        HashBlacklistContent
from dedup.compression import GzipDecompressor, decompress
from dedup.image import GIFHash, PNGHash

# Contents handed to HashBlacklistContent as its blacklist: the empty string
# and a lone newline.
boring_content = {"", "\n"}

def sha512_nontrivial():
    """Return a fresh sha512 hash object wrapped in HashBlacklistContent
    with boring_content as the blacklist."""
    plain_sha512 = hashlib.sha512()
    return HashBlacklistContent(plain_sha512, boring_content)

def gziphash():
    """Return a hash object named "gzip_sha512".

    The object is built in layers: a DecompressedHash combining a
    GzipDecompressor with sha512, wrapped in a SuppressingHash for ValueError
    and zlib.error, wrapped in HashBlacklistContent with boring_content.
    """
    # presumably hashes the gunzipped content -- confirm in dedup.hashing
    inner = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    tolerant = SuppressingHash(inner, (ValueError, zlib.error))
    tolerant.name = "gzip_sha512"
    return HashBlacklistContent(tolerant, boring_content)

def pnghash():
    """Return a hash object named "png_sha512": a PNGHash over sha512 wrapped
    in a SuppressingHash for ValueError."""
    image_hash = PNGHash(hashlib.sha512())
    # presumably ValueError signals non-PNG input -- confirm in dedup.image
    wrapped = SuppressingHash(image_hash, (ValueError,))
    wrapped.name = "png_sha512"
    return wrapped

def gifhash():
    """Return a hash object named "gif_sha512": a GIFHash over sha512 wrapped
    in a SuppressingHash for ValueError."""
    image_hash = GIFHash(hashlib.sha512())
    # presumably ValueError signals non-GIF input -- confirm in dedup.image
    wrapped = SuppressingHash(image_hash, (ValueError,))
    wrapped.name = "gif_sha512"
    return wrapped

def decompress_tar(filelike, extension):
    """Open filelike as a tar stream after passing it through decompress.

    extension is a bytes object (the part of the ar member name that follows
    "control.tar" or "data.tar"); it is decoded as ASCII before being handed
    to decompress. The result is a tarfile object opened in the
    non-seekable streaming read mode "r|".
    """
    suffix = extension.decode("ascii")
    decompressed = decompress(filelike, suffix)
    return tarfile.open(fileobj=decompressed, mode="r|")

class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor.handle_ar_member once the data.tar member
    has been processed; main() treats it as the regular way of finishing."""

class ImportpkgExtractor(DebExtractor):
    """Extractor passing package metadata and per-file hashes to a callback.

    Progress is tracked in self.state, which advances through
    "start" -> "control" -> "control_file" -> "data". The control.tar member
    must therefore be seen (and contain a control file) before data.tar.
    """

    # Factories handed to get_tar_hashes for the members of data.tar.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """Store callback.

        callback is called once with the result of process_control and then
        once per data.tar entry with a dict having the keys "name", "size"
        and "hashes".
        """
        self.state = "start"
        self.callback = callback

    def handle_ar_member(self, name, filelike):
        """Handle one ar member; members other than control.tar*/data.tar*
        are ignored.

        name is the member name as bytes, filelike its content stream.

        Raises ValueError if control.tar shows up when it is not expected or
        if data.tar is reached without a control file having been seen.
        Raises ProcessingFinished after data.tar has been handled.
        """
        control_prefix = b"control.tar"
        data_prefix = b"data.tar"
        if name.startswith(control_prefix):
            if self.state != "start":
                raise ValueError("unexpected control.tar")
            self.state = "control"
            # Whatever follows the prefix selects the decompressor.
            tf = decompress_tar(filelike, name[len(control_prefix):])
            for elem in tf:
                if elem.name not in ("./control", "control"):
                    continue
                # Defensive check: it cannot trigger as long as the loop
                # stops at the first control file below.
                if self.state != "control":
                    raise ValueError("duplicate control file")
                self.state = "control_file"
                self.callback(process_control(tf.extractfile(elem).read()))
                break
        elif name.startswith(data_prefix):
            if self.state != "control_file":
                raise ValueError("missing control file")
            self.state = "data"
            tf = decompress_tar(filelike, name[len(data_prefix):])
            entries = get_tar_hashes(tf, self.hash_functions)
            for filename, size, hashes in entries:
                try:
                    filename = filename.decode("utf8")
                except UnicodeDecodeError:
                    # The warning must go to stderr: this tool emits its yaml
                    # stream on stdout, which a stray text line would corrupt.
                    sys.stderr.write(
                        "warning: skipping filename with encoding error\n")
                    continue # skip files with non-utf8 encoding for now
                self.callback(dict(name=filename, size=size, hashes=hashes))
            raise ProcessingFinished()

    def handle_ar_end(self):
        """Raise ValueError if the archive ended before data.tar was seen."""
        if self.state != "data":
            raise ValueError("data.tar not found")

def main():
    """Read a Debian package from stdin and write a yaml stream to stdout.

    With -H/--hash, stdin is wrapped in a HashedStream using sha256 and
    validated against the given value before the final "commit" document is
    emitted.

    Raises RuntimeError if the extractor returns without raising
    ProcessingFinished.
    """
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has given sha256 hash")
    options, _ = parser.parse_args()
    # Python 3 exposes the binary stream as sys.stdin.buffer; Python 2 has no
    # such attribute and sys.stdin itself is used.
    stdin = getattr(sys.stdin, "buffer", sys.stdin)
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if options.hash:
        stdin = HashedStream(stdin, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(stdin)
    except ProcessingFinished:
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if options.hash:
        # presumably raises on a mismatch, so that "commit" is never
        # written for a corrupted input -- confirm in dedup.hashing
        stdin.validate(options.hash)
    dumper.represent("commit")
    dumper.close()

# Run the importer only when this file is executed as a script.
if __name__ == "__main__":
    main()