1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
|
#!/usr/bin/python
"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout. It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
import hashlib
import optparse
import sys
import tarfile
import zlib
import yaml
from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
HashBlacklistContent
from dedup.compression import GzipDecompressor, decompress
from dedup.image import GIFHash, PNGHash
# Trivial contents (empty, single newline) passed to HashBlacklistContent as
# its blacklist by sha512_nontrivial() and gziphash().
# These are bytes literals on purpose: hashlib only accepts bytes on Python 3,
# so str entries could never compare equal to hashed content there.  On
# Python 2 a b"" literal is a plain str, so nothing changes for that version.
# NOTE(review): assumes HashBlacklistContent compares the raw content against
# this set -- confirm in dedup.hashing.
boring_content = set((b"", b"\n"))
def sha512_nontrivial():
    """Build a fresh SHA-512 hasher wrapped in HashBlacklistContent.

    The wrapper is given boring_content as its blacklist; presumably it
    withholds the digest for those contents (see dedup.hashing to confirm).
    """
    digest = hashlib.sha512()
    return HashBlacklistContent(digest, boring_content)
def gziphash():
    """Build the hasher that is labelled "gzip_sha512".

    Layers, innermost first: a SHA-512 digest fed through a
    DecompressedHash/GzipDecompressor pair, a SuppressingHash that is handed
    ValueError and zlib.error (presumably what non-gzip input triggers), and
    finally a HashBlacklistContent using boring_content as blacklist.
    """
    inflating = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    tolerant = SuppressingHash(inflating, (ValueError, zlib.error))
    # The label is attached to the SuppressingHash layer, not the outer one.
    tolerant.name = "gzip_sha512"
    return HashBlacklistContent(tolerant, boring_content)
def pnghash():
    """Build the hasher that is labelled "png_sha512".

    A PNGHash around a SHA-512 digest, wrapped in a SuppressingHash that is
    handed ValueError (presumably raised for input that is not a valid PNG).
    """
    tolerant = SuppressingHash(PNGHash(hashlib.sha512()), (ValueError,))
    tolerant.name = "png_sha512"
    return tolerant
def gifhash():
    """Build the hasher that is labelled "gif_sha512".

    A GIFHash around a SHA-512 digest, wrapped in a SuppressingHash that is
    handed ValueError (presumably raised for input that is not a valid GIF).
    """
    tolerant = SuppressingHash(GIFHash(hashlib.sha512()), (ValueError,))
    tolerant.name = "gif_sha512"
    return tolerant
def decompress_tar(filelike, extension):
    """Open a (possibly compressed) tar member as a streaming TarFile.

    @param filelike: binary file-like object holding the tar member
    @param extension: bytes that follow ".tar" in the ar member name (for
        example b".gz"); decoded as ASCII and handed to decompress()
    @returns: a tarfile.TarFile opened in non-seekable stream mode ("r|")
    """
    filelike = decompress(filelike, extension.decode("ascii"))
    # Always pin the member-name encoding.  Without it Python 3 falls back to
    # the filesystem encoding of the current locale, so the decoded names --
    # and the validation done by decodetarname() -- would differ from machine
    # to machine.  surrogateescape keeps undecodable bytes recoverable.
    return tarfile.open(fileobj=filelike, mode="r|", encoding="utf8",
                        errors="surrogateescape")


if sys.version_info.major >= 3:
    def decodetarname(name):
        """Decoded name of a tarinfo.

        @param name: member name (str) as produced by decompress_tar()
        @returns: name, unchanged
        @raises UnicodeDecodeError: if the name contains surrogate escapes
            whose underlying bytes are not valid UTF-8
        """
        try:
            name.encode("utf8", "strict")
        except UnicodeEncodeError as e:
            if e.reason == "surrogates not allowed":
                # Recover the raw bytes behind the surrogates and decode
                # them strictly; this raises UnicodeDecodeError when the
                # original file name was not valid UTF-8.
                name.encode("utf8", "surrogateescape").decode("utf8", "strict")
        return name
else:
    def decodetarname(name):
        """Decoded name of a tarinfo.

        @param name: member name as a byte string
        @returns: the name decoded from UTF-8
        @raises UnicodeDecodeError: if the name is not valid UTF-8
        """
        return name.decode("utf8")
class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor once data.tar has been handled.

    main() catches it and treats it as the regular end of extraction.
    """
class ImportpkgExtractor(DebExtractor):
    """Pass the control metadata and per-file hashes of a .deb to a callback.

    The ar members are expected in the order control.tar*, data.tar*.
    Progress is tracked in self.state, which advances through
    "start" -> "control" -> "control_file" -> "data".
    """

    # Hasher factories handed to get_tar_hashes() for the data.tar members.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """
        @param callback: called with one argument for each extracted item:
            first the result of process_control(), then one dict with the
            keys "name", "size" and "hashes" per data.tar entry
        """
        self.state = "start"
        self.callback = callback

    def handle_ar_member(self, name, filelike):
        """Process one member of the outer ar archive.

        Members other than control.tar* and data.tar* are ignored.

        @param name: member name as bytes
        @param filelike: file-like object with the member content
        @raises ValueError: if control.tar* or data.tar* appears out of order
        @raises ProcessingFinished: once data.tar* has been fully processed
        """
        control_prefix = b"control.tar"
        data_prefix = b"data.tar"
        if name.startswith(control_prefix):
            if self.state != "start":
                raise ValueError("unexpected control.tar")
            self.state = "control"
            # Whatever follows the prefix is the compression extension.
            tf = decompress_tar(filelike, name[len(control_prefix):])
            for elem in tf:
                if elem.name not in ("./control", "control"):
                    continue
                if self.state != "control":
                    raise ValueError("duplicate control file")
                self.state = "control_file"
                self.callback(process_control(tf.extractfile(elem).read()))
                break
        elif name.startswith(data_prefix):
            if self.state != "control_file":
                raise ValueError("missing control file")
            self.state = "data"
            tf = decompress_tar(filelike, name[len(data_prefix):])
            for rawname, size, hashes in get_tar_hashes(tf,
                                                        self.hash_functions):
                try:
                    filename = decodetarname(rawname)
                except UnicodeDecodeError:
                    # stdout carries the YAML stream, so diagnostics must go
                    # to stderr to avoid corrupting the emitted documents.
                    sys.stderr.write(
                        "warning: skipping filename with encoding error\n")
                    continue  # skip files with non-utf8 encoding for now
                self.callback(dict(name=filename, size=size, hashes=hashes))
            # Nothing after data.tar is of interest; unwind out of process().
            raise ProcessingFinished()

    def handle_ar_end(self):
        """Check at the end of the ar archive that data.tar* was seen.

        @raises ValueError: if no data.tar* member has been processed
        """
        if self.state != "data":
            raise ValueError("data.tar not found")
def main():
    """Read a Debian package from stdin and dump YAML documents to stdout.

    Every item reported by ImportpkgExtractor is represented as one YAML
    document, followed by a final document holding the string "commit".
    With -H/--hash, stdin is wrapped in a HashedStream over SHA-256 and
    validated against the given value before "commit" is emitted.

    @raises RuntimeError: if the extractor returns without raising
        ProcessingFinished
    """
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has given sha256 hash")
    options, _ = parser.parse_args()
    try:
        stdin = sys.stdin.buffer
    except AttributeError:  # python2
        stdin = sys.stdin
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if options.hash:
        stdin = HashedStream(stdin, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(stdin)
    except ProcessingFinished:
        # Expected: raised by the extractor after data.tar was handled.
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if options.hash:
        stdin.validate(options.hash)
    # Only emitted once extraction (and validation, if requested) succeeded.
    dumper.represent("commit")
    dumper.close()
# Run the importer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|