summaryrefslogtreecommitdiff
path: root/importpkg.py
blob: 54f6181939abb7614f94c2eaca9d1c5f8182a523 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/python
"""This tool reads a debian package from stdin and emits a yaml stream on
stdout.  It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""

import hashlib
import optparse
import sys
import tarfile
import zlib

import lzma
import yaml

from dedup.arreader import ArReader
from dedup.debpkg import process_control, get_tar_hashes
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
    HashedStream
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import GIFHash, PNGHash

# sha512 hex digests of trivially boring file contents (empty file, lone
# newline).  Passed to HashBlacklist in sha512_nontrivial(), presumably so
# these digests are not reported — confirm against dedup.hashing.
boring_sha512_hashes = set((
    # ""
    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
    # "\n"
    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))

def sha512_nontrivial():
    """Build a sha512 hash object wrapped in a HashBlacklist so that the
    digests of boring contents (see boring_sha512_hashes) are filtered."""
    hashobj = hashlib.sha512()
    return HashBlacklist(hashobj, boring_sha512_hashes)

def gziphash():
    """Build a hash object named "gzip_sha512" that computes the sha512 of
    the gzip-decompressed input and suppresses decompression failures."""
    inner = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    # ValueError / zlib.error signal non-gzip input; suppress rather than raise
    wrapped = SuppressingHash(inner, (ValueError, zlib.error))
    wrapped.name = "gzip_sha512"
    # don't blacklist boring hashes for gzip to get gzip issues right
    return wrapped

def pnghash():
    """Build a hash object named "png_sha512" hashing PNG image content;
    non-PNG input (ValueError) is suppressed instead of raised."""
    png_obj = PNGHash(hashlib.sha512())
    png_obj = SuppressingHash(png_obj, (ValueError,))
    png_obj.name = "png_sha512"
    return png_obj

def gifhash():
    """Build a hash object named "gif_sha512" hashing GIF image content;
    non-GIF input (ValueError) is suppressed instead of raised."""
    gif_obj = GIFHash(hashlib.sha512())
    gif_obj = SuppressingHash(gif_obj, (ValueError,))
    gif_obj.name = "gif_sha512"
    return gif_obj

def process_package(filelike, hash_functions):
    """Parse a Debian binary package (an ar archive) from filelike.

    Yields, in order:
      * the result of process_control() on the ./control member of
        control.tar.gz (package metadata),
      * one dict(name=..., size=..., hashes=...) per entry reported by
        get_tar_hashes() for the data.tar member,
      * finally the string "commit".

    hash_functions is passed through unchanged to get_tar_hashes.

    Raises ValueError if no data.tar member is found, if control.tar.gz
    appears out of order or twice, or if data.tar precedes the control file.
    """
    af = ArReader(filelike)
    af.read_magic()
    # State machine over the ar members:
    #   "start"        -> nothing seen yet
    #   "control"      -> inside control.tar.gz, ./control not yet processed
    #   "control_file" -> ./control processed; data.tar may follow
    state = "start"
    while True:
        try:
            name = af.read_entry()
        except EOFError:
            # exhausted the ar archive without finding any data.tar variant
            raise ValueError("data.tar not found")
        if name == "control.tar.gz":
            if state != "start":
                raise ValueError("unexpected control.tar.gz")
            state = "control"
            # streaming ("r|gz") mode: members must be consumed in order
            tf = tarfile.open(fileobj=af, mode="r|gz")
            for elem in tf:
                if elem.name != "./control":
                    continue
                if state != "control":
                    raise ValueError("duplicate control file")
                state = "control_file"
                yield process_control(tf.extractfile(elem).read())
                break
            continue
        elif name == "data.tar.gz":
            tf = tarfile.open(fileobj=af, mode="r|gz")
        elif name == "data.tar.bz2":
            tf = tarfile.open(fileobj=af, mode="r|bz2")
        elif name == "data.tar.xz":
            # tarfile streaming mode cannot decompress xz here, so wrap the
            # ar member in an explicit lzma-decompressing stream first
            zf = DecompressedStream(af, lzma.LZMADecompressor())
            tf = tarfile.open(fileobj=zf, mode="r|")
        elif name == "data.tar":
            tf = tarfile.open(fileobj=af, mode="r|")
        else:
            # other ar members (e.g. debian-binary) are irrelevant here
            continue
        if state != "control_file":
            raise ValueError("missing control file")
        for name, size, hashes in get_tar_hashes(tf, hash_functions):
            try:
                name = name.decode("utf8")
            except UnicodeDecodeError:
                print("warning: skipping filename with encoding error")
                continue # skip files with non-utf8 encoding for now
            yield dict(name=name, size=size, hashes=hashes)
        yield "commit"
        break

def process_package_with_hash(filelike, hash_functions, sha256hash):
    """Like process_package, but additionally verify that the entire input
    stream has the sha256 hex digest sha256hash.  The check happens right
    before the final "commit" document is forwarded; on mismatch a
    ValueError is raised instead of yielding "commit"."""
    hstream = HashedStream(filelike, hashlib.sha256())
    for elem in process_package(hstream, hash_functions):
        if elem != "commit":
            yield elem
            continue
        # drain any trailing bytes so the digest covers the whole stream
        while hstream.read(4096):
            pass
        if hstream.hexdigest() != sha256hash:
            raise ValueError("hash sum mismatch")
        yield elem
        break

def main():
    """Entry point: read a .deb package from stdin and dump the resulting
    yaml documents to stdout, optionally verifying the input's sha256.

    With -H/--hash the whole stdin stream is checked against the given
    sha256 hex digest (ValueError on mismatch)."""
    parser = optparse.OptionParser()
    # fixed garbled help text ("stdin hash given sha256 hash")
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has the given sha256 hash")
    options, args = parser.parse_args()
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]
    if options.hash:
        gen = process_package_with_hash(sys.stdin, hash_functions,
                                        options.hash)
    else:
        gen = process_package(sys.stdin, hash_functions)
    # safe_dump_all emits one yaml document per generated element
    yaml.safe_dump_all(gen, sys.stdout)

# Run only when executed as a script, not on import.
if __name__ == "__main__":
    main()