diff options
author | Helmut Grohne <helmut@subdivi.de> | 2014-12-12 13:28:02 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2014-12-12 13:28:02 +0100 |
commit | 36fd6fc4bd1c6930c77aa5b6408a832c1e651ef6 (patch) | |
tree | b1a9fcedb51aee454f42e5d1afa8b94db0ccafea /importpkg.py | |
parent | 2a728ab85e1ddfeec03514f86f706c116ca94440 (diff) | |
download | debian-dedup-36fd6fc4bd1c6930c77aa5b6408a832c1e651ef6.tar.gz |
full text searching on control.tar members
This is a rather strange variant that has nothing to do with
deduplication anymore. Instead, it enables searching a sqlite fts4 table
containing all members of control.tars.
Diffstat (limited to 'importpkg.py')
-rwxr-xr-x | importpkg.py | 71 |
1 file changed, 24 insertions, 47 deletions
diff --git a/importpkg.py b/importpkg.py index 7e074e1..bef0be0 100755 --- a/importpkg.py +++ b/importpkg.py @@ -5,6 +5,7 @@ on multiple machines. The generated yaml contains multiple documents. The first document contains package metadata. Then a document is emitted for each file. And finally a document consisting of the string "commit" is emitted.""" +import binascii import hashlib import optparse import sys @@ -23,27 +24,6 @@ from dedup.image import GIFHash, PNGHash boring_content = set(("", "\n")) -def sha512_nontrivial(): - return HashBlacklistContent(hashlib.sha512(), boring_content) - -def gziphash(): - hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512()) - hashobj = SuppressingHash(hashobj, (ValueError, zlib.error)) - hashobj.name = "gzip_sha512" - return HashBlacklistContent(hashobj, boring_content) - -def pnghash(): - hashobj = PNGHash(hashlib.sha512()) - hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "png_sha512" - return hashobj - -def gifhash(): - hashobj = GIFHash(hashlib.sha512()) - hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "gif_sha512" - return hashobj - def decompress_tar(filelike, extension): if extension in (".lzma", ".xz"): filelike = DecompressedStream(filelike, lzma.LZMADecompressor()) @@ -53,47 +33,45 @@ def decompress_tar(filelike, extension): extension) return tarfile.open(fileobj=filelike, mode="r|" + extension[1:]) -def process_package(filelike, hash_functions): +def process_package(filelike): af = ArReader(filelike) af.read_magic() state = "start" + record = None while True: try: name = af.read_entry() except EOFError: - raise ValueError("data.tar not found") + raise ValueError("control.tar not found") if name.startswith("control.tar"): if state != "start": raise ValueError("unexpected control.tar") state = "control" tf = decompress_tar(af, name[11:]) + controldata = {} for elem in tf: - if elem.name not in ("./control", "control"): + if not elem.isreg(): continue - if state != 
"control": - raise ValueError("duplicate control file") - state = "control_file" - yield process_control(tf.extractfile(elem).read()) - break - continue - elif name.startswith("data.tar"): - if state != "control_file": - raise ValueError("missing control file") - state = "data" - tf = decompress_tar(af, name[8:]) - for name, size, hashes in get_tar_hashes(tf, hash_functions): - try: - name = name.decode("utf8") - except UnicodeDecodeError: - print("warning: skipping filename with encoding error") - continue # skip files with non-utf8 encoding for now - yield dict(name=name, size=size, hashes=hashes) + elemname = elem.name + if elemname.startswith("./"): + elemname = elemname[2:] + content = tf.extractfile(elem).read() + if elemname in controldata: + raise ValueError("duplicate entry %r in control.tar" % + elemname) + controldata[elemname] = binascii.b2a_base64(content).strip() + if elemname == "control": + record = process_control(content) + if record is None: + raise ValueError("control file missing from control.tar") + record["data"] = controldata + yield record yield "commit" break -def process_package_with_hash(filelike, hash_functions, sha256hash): +def process_package_with_hash(filelike, sha256hash): hstream = HashedStream(filelike, hashlib.sha256()) - for elem in process_package(hstream, hash_functions): + for elem in process_package(hstream): if elem == "commit": while hstream.read(4096): pass @@ -108,11 +86,10 @@ def main(): parser.add_option("-H", "--hash", action="store", help="verify that stdin hash given sha256 hash") options, args = parser.parse_args() - hash_functions = [sha512_nontrivial, gziphash, pnghash, gifhash] if options.hash: - gen = process_package_with_hash(sys.stdin, hash_functions, options.hash) + gen = process_package_with_hash(sys.stdin, options.hash) else: - gen = process_package(sys.stdin, hash_functions) + gen = process_package(sys.stdin) yaml.safe_dump_all(gen, sys.stdout) if __name__ == "__main__": |