#!/usr/bin/python
"""This tool reads a Debian package from stdin and emits a yaml stream on
stdout.  It does not access a database.  Therefore it can be run in parallel
and on multiple machines.  The generated yaml contains multiple documents.  The
first document contains package metadata.  Then a document is emitted for each
file.  And finally a document consisting of the string "commit" is emitted."""

import hashlib
import optparse
import sys
import tarfile
import zlib

import yaml

from dedup.debpkg import DebExtractor, process_control, get_tar_hashes
from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
        HashBlacklistContent
from dedup.compression import GzipDecompressor, decompress
from dedup.image import GIFHash, PNGHash

# Contents that are passed as the blacklist to HashBlacklistContent.  These
# are bytes literals because file contents read from the package are bytes;
# on python2 b"" and "" are the same object type, so nothing changes there.
# NOTE(review): assumes HashBlacklistContent compares the raw (bytes) content
# against this set -- confirm against dedup.hashing.
boring_content = set((b"", b"\n"))


def sha512_nontrivial():
    """Create a sha512 hash object wrapped in HashBlacklistContent using
    boring_content as the blacklist."""
    return HashBlacklistContent(hashlib.sha512(), boring_content)


def gziphash():
    """Create a hash object named "gzip_sha512": a sha512 fed through a
    GzipDecompressor, with ValueError and zlib.error passed to
    SuppressingHash, wrapped in HashBlacklistContent using boring_content."""
    hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
    hashobj.name = "gzip_sha512"
    return HashBlacklistContent(hashobj, boring_content)


def pnghash():
    """Create a hash object named "png_sha512": a PNGHash over sha512 with
    ValueError passed to SuppressingHash."""
    hashobj = PNGHash(hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError,))
    hashobj.name = "png_sha512"
    return hashobj


def gifhash():
    """Create a hash object named "gif_sha512": a GIFHash over sha512 with
    ValueError passed to SuppressingHash."""
    hashobj = GIFHash(hashlib.sha512())
    hashobj = SuppressingHash(hashobj, (ValueError,))
    hashobj.name = "gif_sha512"
    return hashobj


def decompress_tar(filelike, extension):
    """Open a (possibly compressed) tar archive for streaming reads.

    @param filelike: file-like object yielding the archive data
    @param extension: bytes following the ".tar" part of the member name
        (e.g. b".gz"); it is decoded as ascii and handed to decompress
    @returns: a tarfile.TarFile opened in non-seekable stream mode ("r|")
    """
    filelike = decompress(filelike, extension.decode("ascii"))
    return tarfile.open(fileobj=filelike, mode="r|")


class ProcessingFinished(Exception):
    """Raised by ImportpkgExtractor after all of data.tar was handled, to stop
    processing of any further ar members."""


class ImportpkgExtractor(DebExtractor):
    """Extract package metadata and per-file hashes from a Debian package.

    Every result (first the processed control data, then one dict per file
    with the keys name, size and hashes) is passed to the callback.  The
    attribute state tracks progress:
    "start" -> "control" -> "control_file" -> "data".
    """

    # Factories for the hash objects passed to get_tar_hashes.
    hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash]

    def __init__(self, callback):
        """@param callback: callable invoked with each extracted item"""
        self.state = "start"
        self.callback = callback

    def handle_ar_member(self, name, filelike):
        """Handle one member of the ar archive.

        @param name: member name as bytes
        @param filelike: file-like object with the member contents
        @raises ValueError: if control.tar or data.tar show up in an
            unexpected order
        @raises ProcessingFinished: after data.tar was processed completely
        """
        if name.startswith(b"control.tar"):
            if self.state != "start":
                raise ValueError("unexpected control.tar")
            self.state = "control"
            # name[11:] is whatever follows b"control.tar", e.g. b".gz"
            tf = decompress_tar(filelike, name[11:])
            for elem in tf:
                if elem.name not in ("./control", "control"):
                    continue
                # Defensive check: the loop is left right after the first
                # control file, so the state is still "control" here.
                if self.state != "control":
                    raise ValueError("duplicate control file")
                self.state = "control_file"
                self.callback(process_control(tf.extractfile(elem).read()))
                break
        elif name.startswith(b"data.tar"):
            if self.state != "control_file":
                raise ValueError("missing control file")
            self.state = "data"
            # name[8:] is whatever follows b"data.tar", e.g. b".xz"
            tf = decompress_tar(filelike, name[8:])
            for filename, size, hashes in get_tar_hashes(tf,
                                                         self.hash_functions):
                try:
                    filename = filename.decode("utf8")
                except UnicodeDecodeError:
                    # stdout carries the yaml stream, so the warning has to
                    # go to stderr to avoid corrupting the output.
                    sys.stderr.write(
                        "warning: skipping filename with encoding error\n")
                    continue  # skip files with non-utf8 encoding for now
                self.callback(dict(name=filename, size=size, hashes=hashes))
            raise ProcessingFinished()

    def handle_ar_end(self):
        """Called at the end of the ar archive.

        @raises ValueError: if no data.tar member was processed
        """
        if self.state != "data":
            raise ValueError("data.tar not found")


def main():
    """Read a Debian package from stdin and write the yaml stream to stdout.

    With -H/--hash the input is additionally wrapped in a sha256
    HashedStream and validated against the given value before the final
    "commit" document is emitted.

    @raises RuntimeError: if the extractor returns without raising
        ProcessingFinished
    """
    parser = optparse.OptionParser()
    parser.add_option("-H", "--hash", action="store",
                      help="verify that stdin has the given sha256 hash")
    options, _args = parser.parse_args()
    try:
        stdin = sys.stdin.buffer
    except AttributeError:  # python2
        stdin = sys.stdin
    dumper = yaml.SafeDumper(sys.stdout)
    dumper.open()
    if options.hash:
        stdin = HashedStream(stdin, hashlib.sha256())
    try:
        ImportpkgExtractor(dumper.represent).process(stdin)
    except ProcessingFinished:
        pass
    else:
        raise RuntimeError("unexpected termination of extractor")
    if options.hash:
        stdin.validate(options.hash)
    # The "commit" document is only emitted after successful validation.
    dumper.represent("commit")
    dumper.close()


if __name__ == "__main__":
    main()