1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
|
#!/usr/bin/python
"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout. It does not access a database. Therefore it can be run in parallel and
on multiple machines. The generated yaml contains multiple documents. The first
document contains package metadata. Then a document is emitted for each file.
And finally a document consisting of the string "commit" is emitted."""
import binascii
import hashlib
import optparse
import sys
import tarfile
import zlib
import lzma
import yaml
from dedup.arreader import ArReader
from dedup.debpkg import process_control, get_tar_hashes
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
HashBlacklistContent
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import GIFHash, PNGHash
# Contents regarded as "boring": the empty string and a lone newline.
# NOTE(review): not referenced by the code visible in this file; presumably
# kept for other importers -- confirm before removing.
boring_content = {"", "\n"}
def decompress_tar(filelike, extension):
    """Open a (possibly compressed) tar archive in streaming mode.

    filelike is the object the archive is read from and extension is the
    file name suffix after ".tar": "", ".gz", ".bz2", ".lzma" or ".xz".
    Returns the tarfile.TarFile object produced by tarfile.open.
    Raises ValueError for any other extension.
    """
    lzma_suffixes = (".lzma", ".xz")
    tarfile_suffixes = ("", ".gz", ".bz2")
    if extension in lzma_suffixes:
        # These two are unpacked by wrapping the input in a decompressing
        # stream; tarfile then sees an uncompressed archive.
        filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
        extension = ""
    if extension in tarfile_suffixes:
        # Dropping the leading dot yields the tarfile stream modes "r|",
        # "r|gz" and "r|bz2".
        return tarfile.open(fileobj=filelike, mode="r|" + extension[1:])
    raise ValueError("unknown compression format with extension %r" %
                     extension)
def process_package(filelike):
    """Generate the yaml documents for the package read from filelike.

    The input is parsed as an ar archive using ArReader. Members are read
    until one whose name starts with "control.tar" is found. From that tar
    archive every regular file is collected, base64 encoded, into a dict
    keyed by the member name (a leading "./" is stripped). The file named
    "control" is additionally passed to process_control, whose result becomes
    the record.

    Yields the record, extended with a "data" key holding the collected
    dict, followed by the string "commit".

    Raises ValueError if the archive ends before a control.tar member is
    seen, if control.tar contains the same file name twice, if it contains
    no "control" file or (via decompress_tar) if its compression extension
    is not supported.
    """
    prefix = "control.tar"
    archive = ArReader(filelike)
    archive.read_magic()
    state = "start"
    record = None
    while True:
        try:
            member = archive.read_entry()
        except EOFError:
            raise ValueError("control.tar not found")
        if not member.startswith(prefix):
            # Every other ar member is skipped.
            continue
        # NOTE(review): the generator stops after the first control.tar, so
        # this check cannot trigger as the code stands.
        if state != "start":
            raise ValueError("unexpected control.tar")
        state = "control"
        # Whatever follows "control.tar" in the member name is the
        # compression extension.
        tar = decompress_tar(archive, member[len(prefix):])
        controldata = {}
        for entry in tar:
            if not entry.isreg():
                continue
            if entry.name.startswith("./"):
                key = entry.name[2:]
            else:
                key = entry.name
            content = tar.extractfile(entry).read()
            if key in controldata:
                raise ValueError("duplicate entry %r in control.tar" %
                                 key)
            controldata[key] = binascii.b2a_base64(content).strip()
            if key == "control":
                record = process_control(content)
        if record is None:
            raise ValueError("control file missing from control.tar")
        record["data"] = controldata
        yield record
        yield "commit"
        return
def process_package_with_hash(filelike, sha256hash):
    """Like process_package, but check the input against a sha256 digest.

    filelike is wrapped in a HashedStream using sha256 and handed to
    process_package. Every item is passed through unchanged. Before the
    final "commit" item is yielded, the rest of the stream is read in 4096
    byte chunks and the hex digest is compared with sha256hash.

    Raises ValueError("hash sum mismatch") if the digests differ; in that
    case "commit" is never yielded.
    """
    hstream = HashedStream(filelike, hashlib.sha256())
    for item in process_package(hstream):
        is_commit = item == "commit"
        if not is_commit:
            yield item
            continue
        # Read whatever process_package left unread; presumably HashedStream
        # only hashes the data that is read through it -- confirm.
        while True:
            if not hstream.read(4096):
                break
        if hstream.hexdigest() != sha256hash:
            raise ValueError("hash sum mismatch")
        yield item
        return
def main():
    """Command line entry point: read a package from stdin, dump yaml.

    Recognizes the option -H/--hash HASH. If given, the package is processed
    with process_package_with_hash so that the sha256 digest of stdin is
    checked against HASH; otherwise process_package is used. The generated
    documents are written to stdout with yaml.safe_dump_all.
    """
    parser = optparse.OptionParser()
    # The help text used to read "stdin hash given sha256 hash" (typo).
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has given sha256 hash")
    # Positional arguments are accepted but ignored.
    options, _args = parser.parse_args()
    if options.hash:
        gen = process_package_with_hash(sys.stdin, options.hash)
    else:
        gen = process_package(sys.stdin)
    yaml.safe_dump_all(gen, sys.stdout)
# Run the command line entry point only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
|