1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
|
#!/usr/bin/python
"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout. It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
import argparse
import hashlib
import sys
import zlib
try:
from urllib.request import urlopen
except ImportError:
from urllib import urlopen
import yaml
from dedup.debpkg import DebExtractor, decodetarname, get_tar_hashes, \
process_control
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
HashBlacklistContent
from dedup.compression import GzipDecompressor
from dedup.image import GIFHash, PNGHash
# Inputs that are too trivial to be worth a hash entry: the empty file and a
# file consisting of a single newline. The entries are bytes literals because
# the package is read from a binary stream (sys.stdin.buffer on Python 3) and
# hashlib only accepts bytes there; str entries could never match such input.
# On Python 2, b"" and "" are the same type, so behaviour there is unchanged.
boring_content = {b"", b"\n"}
def sha512_nontrivial():
    """Return a fresh sha512 hash wrapped in HashBlacklistContent.

    The module-level boring_content set is passed as the blacklist.
    """
    digest = hashlib.sha512()
    return HashBlacklistContent(digest, boring_content)
def gziphash():
    """Return a fresh hash object named "gzip_sha512".

    Layers, innermost first: a DecompressedHash combining a GzipDecompressor
    with sha512, a SuppressingHash given (ValueError, zlib.error), and a
    HashBlacklistContent using boring_content as the blacklist.
    """
    decompressed = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    # NOTE(review): SuppressingHash presumably swallows the listed exceptions
    # for input that is not valid gzip -- confirm in dedup.hashing.
    guarded = SuppressingHash(decompressed, (ValueError, zlib.error))
    guarded.name = "gzip_sha512"
    return HashBlacklistContent(guarded, boring_content)
def pnghash():
    """Return a fresh hash object named "png_sha512".

    It is a PNGHash over sha512, wrapped in a SuppressingHash that is given
    ValueError as the only exception type.
    """
    guarded = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    guarded.name = "png_sha512"
    return guarded
def gifhash():
    """Return a fresh hash object named "gif_sha512".

    It is a GIFHash over sha512, wrapped in a SuppressingHash that is given
    ValueError as the only exception type.
    """
    guarded = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    guarded.name = "gif_sha512"
    return guarded
class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor.handle_data_tar once every member of the
    data tar has been reported; main() treats it as the success signal."""
class ImportpkgExtractor(DebExtractor):
    """DebExtractor that reports package metadata and per-file hashes.

    Every result is handed to the callback supplied at construction time:
    the value returned by process_control() for the control file, and one
    dict with the keys "name", "size" and "hashes" for each entry yielded by
    get_tar_hashes() on the data tar.
    """

    # Hash factories handed to get_tar_hashes() for the data tar.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """Remember callback, a callable taking one result per call."""
        DebExtractor.__init__(self)
        self.callback = callback

    def handle_control_tar(self, tarfileobj):
        """Report the control file found in the control tar.

        The first member named "./control" or "control" is read, passed
        through process_control() and handed to the callback.

        Raises ValueError if the archive has no such member.
        """
        for elem in tarfileobj:
            if elem.name not in ("./control", "control"):
                continue
            self.callback(process_control(tarfileobj.extractfile(elem).read()))
            return
        raise ValueError("missing control file")

    def handle_data_tar(self, tarfileobj):
        """Report every data tar entry, then raise ProcessingFinished.

        Entries whose name fails decodetarname() with a UnicodeDecodeError
        are skipped with a warning. ProcessingFinished is always raised at
        the end so that the caller can tell a complete run from an extractor
        that stopped early.
        """
        for name, size, hashes in get_tar_hashes(tarfileobj,
                                                 self.hash_functions):
            try:
                name = decodetarname(name)
            except UnicodeDecodeError:
                # The warning must go to stderr: main() writes the yaml
                # stream to stdout, and a stray text line there would
                # corrupt the emitted documents.
                sys.stderr.write(
                    "warning: skipping filename with encoding error\n")
                continue # skip files with non-utf8 encoding for now
            self.callback(dict(name=name, size=size, hashes=hashes))
        raise ProcessingFinished()
def main():
    """Read a Debian package and dump its yaml description to stdout.

    The package is read from the optional positional argument, which is
    opened with urlopen, or from stdin when no argument is given. With
    -H/--hash the input is wrapped in a HashedStream using sha256 and its
    validate() method is called with the given value after extraction.

    The trailing "commit" document is only emitted after extraction (and
    validation, if requested) went through without an exception.

    Raises RuntimeError if the extractor returns without raising
    ProcessingFinished.
    """
    try:
        # Python 3: the package is binary data, so use the byte stream.
        stdin = sys.stdin.buffer
    except AttributeError: # python2
        stdin = sys.stdin
    parser = argparse.ArgumentParser()
    parser.add_argument("-H", "--hash", action="store",
                        help="verify that stdin has the given sha256 hash")
    # argparse applies type= only to string values, so the non-string stdin
    # default is used as is and never passed to urlopen.
    parser.add_argument("input", nargs='?', default=stdin, type=urlopen,
                        help="read from this location instead of stdin")
    args = parser.parse_args()
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if args.hash:
        args.input = HashedStream(args.input, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(args.input)
    except ProcessingFinished:
        # Expected: handle_data_tar raises this after the last entry.
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if args.hash:
        # NOTE(review): validate() presumably raises on a mismatch, which
        # would prevent the "commit" document below -- confirm in
        # dedup.hashing.
        args.input.validate(args.hash)
    dumper.represent("commit")
    dumper.close()
# Run the importer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|