summaryrefslogtreecommitdiff
path: root/importpkg.py
blob: e8cc2fa20f8a0535a1763af42cc41296e8fd0e27 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
#!/usr/bin/python
"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout.  It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""

import hashlib
import optparse
import sys
import tarfile
import zlib

import yaml

from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
        HashBlacklistContent
from dedup.compression import GzipDecompressor, decompress
from dedup.image import GIFHash, PNGHash

# File contents whose hashes carry no information: empty files and a
# lone newline. These are blacklisted from hashing below.
boring_content = {"", "\n"}

def sha512_nontrivial():
    """Construct a sha512 hash object that blacklists boring content.

    Hashing one of the values in ``boring_content`` yields no digest,
    so trivial files are not recorded.
    """
    plain = hashlib.sha512()
    return HashBlacklistContent(plain, boring_content)

def gziphash():
    """Construct a hash object computing the sha512 of gzip-decompressed
    content.

    Decompression failures (ValueError, zlib.error) are suppressed, and
    boring decompressed content is blacklisted. The resulting hash is
    named "gzip_sha512".
    """
    inner = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    suppressed = SuppressingHash(inner, (ValueError, zlib.error))
    suppressed.name = "gzip_sha512"
    return HashBlacklistContent(suppressed, boring_content)

def pnghash():
    """Construct a hash object computing the sha512 of decoded PNG image
    data, named "png_sha512". Decoding failures (ValueError) are
    suppressed rather than propagated.
    """
    png = PNGHash(hashlib.sha512())
    suppressed = SuppressingHash(png, (ValueError,))
    suppressed.name = "png_sha512"
    return suppressed

def gifhash():
    """Construct a hash object computing the sha512 of decoded GIF image
    data, named "gif_sha512". Decoding failures (ValueError) are
    suppressed rather than propagated.
    """
    gif = GIFHash(hashlib.sha512())
    suppressed = SuppressingHash(gif, (ValueError,))
    suppressed.name = "gif_sha512"
    return suppressed

# tarfile member-name handling differs between Python 3 and Python 2, so
# both decompress_tar and decodetarname are defined per major version.
if sys.version_info.major >= 3:
    def decompress_tar(filelike, extension):
        """Open a streamed tar archive after undoing the compression
        indicated by extension (e.g. b".gz"); extension must be ASCII bytes.
        """
        filelike = decompress(filelike, extension.decode("ascii"))
        # mode "r|" = non-seekable stream; on Python 3 tarfile decodes
        # member names with the surrogateescape error handler by default.
        return tarfile.open(fileobj=filelike, mode="r|")

    def decodetarname(name):
        """Decoded name of a tarinfo.
        @raises UnicodeDecodeError:
        """
        try:
            # Strict re-encode detects lone surrogates left behind by
            # tarfile's surrogateescape decoding of non-utf8 names.
            name.encode("utf8", "strict")
        except UnicodeEncodeError as e:
            if e.reason == "surrogates not allowed":
                # Recover the original undecodable bytes and decode them
                # strictly so the promised UnicodeDecodeError is raised.
                # NOTE(review): if those bytes happen to form valid utf8
                # this decode succeeds and a surrogate-containing name is
                # returned unchanged — confirm this cannot occur here.
                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
        return name
else:
    def decompress_tar(filelike, extension):
        """Open a streamed tar archive after undoing the compression
        indicated by extension (e.g. b".gz"); extension must be ASCII bytes.
        """
        filelike = decompress(filelike, extension.decode("ascii"))
        # Mirror the Python 3 default explicitly: decode member names as
        # utf8 with surrogateescape so both branches behave alike.
        return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
                            errors="surrogateescape")

    def decodetarname(name):
        """Decoded name of a tarinfo.
        @raises UnicodeDecodeError:
        """
        # On Python 2 names are bytes; a strict utf8 decode raises
        # UnicodeDecodeError for non-utf8 names, matching the contract.
        return name.decode("utf8")

class ProcessingFinished(Exception):
    """Signals that the interesting part of the archive was consumed."""

class ImportpkgExtractor(DebExtractor):
    """Extractor feeding a callback with a package's metadata and files.

    The callback first receives the parsed control data, then one dict
    (name, size, hashes) per data.tar member. Once data.tar has been
    consumed, ProcessingFinished is raised to stop further processing.
    """

    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        # State machine: start -> control -> control_file -> data.
        self.state = "start"
        self.callback = callback

    def handle_ar_member(self, name, filelike):
        # name is the raw (bytes) ar member name; dispatch on its prefix.
        if name.startswith(b"control.tar"):
            self._consume_control(name, filelike)
        elif name.startswith(b"data.tar"):
            self._consume_data(name, filelike)

    def _consume_control(self, name, filelike):
        # Deliver the parsed ./control file to the callback exactly once.
        if self.state != "start":
            raise ValueError("unexpected control.tar")
        self.state = "control"
        # name[11:] is the compression extension after "control.tar".
        archive = decompress_tar(filelike, name[11:])
        for member in archive:
            if member.name not in ("./control", "control"):
                continue
            if self.state != "control":
                raise ValueError("duplicate control file")
            self.state = "control_file"
            self.callback(process_control(archive.extractfile(member).read()))
            break

    def _consume_data(self, name, filelike):
        # Hash every member of data.tar and deliver it to the callback.
        if self.state != "control_file":
            raise ValueError("missing control file")
        self.state = "data"
        # name[8:] is the compression extension after "data.tar".
        archive = decompress_tar(filelike, name[8:])
        for entry, size, hashes in get_tar_hashes(archive,
                                                  self.hash_functions):
            try:
                entry = decodetarname(entry)
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            self.callback(dict(name=entry, size=size, hashes=hashes))
        # data.tar is the last part we care about; stop processing here.
        raise ProcessingFinished()

    def handle_ar_end(self):
        if self.state != "data":
            raise ValueError("data.tar not found")

def main():
    """Read a Debian package from stdin and emit a yaml stream on stdout.

    With -H/--hash, additionally verify that stdin matches the given
    sha256 hex digest and fail (via HashedStream.validate) if it does not.
    """
    parser = optparse.OptionParser()
    # Fixed garbled help text: "stdin hash given" -> "stdin has the given".
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has the given sha256 hash")
    options, args = parser.parse_args()
    try:
        stdin = sys.stdin.buffer
    except AttributeError: # python2
        stdin = sys.stdin
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if options.hash:
        # Wrap stdin so the sha256 is computed while the package is read.
        stdin = HashedStream(stdin, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(stdin)
    except ProcessingFinished:
        # Expected: the extractor raises this once data.tar is consumed.
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if options.hash:
        stdin.validate(options.hash)
    # Final document marking a complete, committable package.
    dumper.represent("commit")
    dumper.close()

if __name__ == "__main__":
    main()