From 332ac9eafb235443f163c606ced95dcbd615815e Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 19 Feb 2014 14:21:20 +0100
Subject: blacklist content rather than hashes

Otherwise the gzip hash cannot tell the empty stream and the compressed
empty stream apart.
---
 importpkg.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

(limited to 'importpkg.py')

diff --git a/importpkg.py b/importpkg.py
index 54f6181..cb16f97 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -16,26 +16,21 @@ import yaml
 
 from dedup.arreader import ArReader
 from dedup.debpkg import process_control, get_tar_hashes
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+    HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-boring_sha512_hashes = set((
-    # ""
-    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
-    # "\n"
-    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
 
 def sha512_nontrivial():
-    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+    return HashBlacklistContent(hashlib.sha512(), boring_content)
 
 def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    # don't blacklist boring hashes for gzip to get gzip issues right
-    return hashobj
+    return HashBlacklistContent(hashobj, boring_content)
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
--
cgit v1.2.3
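Why blacklist content instead of digests: gzip_sha512 hashes the decompressed payload, so a gzip member whose payload is empty yields the same sha512 as a literally empty file, and a digest-based blacklist could not treat the two differently (which is why the old code skipped blacklisting for gzip entirely). Checking the raw input bytes sidesteps that. The snippet below is a minimal sketch of such a wrapper, assuming only a hashlib-like update/hexdigest interface; the class name and details are illustrative, and the real HashBlacklistContent lives in dedup/hashing.py outside this diff.

    import hashlib

    class ContentBlacklistSketch(object):
        """Suppress the digest when the complete raw input equals one of the
        blacklisted ("boring") values; otherwise behave like the wrapped hash."""
        def __init__(self, hashobj, blacklist=()):
            self.hashobj = hashobj
            self.blacklist = blacklist
            # Longest value that can still match; longer input is never boring.
            self.maxlen = max(len(item) for item in blacklist) if blacklist else 0
            self.stored = b""

        def update(self, data):
            if self.stored is not None:
                self.stored += data
                if len(self.stored) > self.maxlen:
                    self.stored = None
            self.hashobj.update(data)

        def hexdigest(self):
            if self.stored is not None and self.stored in self.blacklist:
                return None  # boring content: report no hash at all
            return self.hashobj.hexdigest()

    # An empty input is suppressed, while a gzip stream that merely decompresses
    # to nothing is not, because its raw bytes are not in the blacklist.
    h = ContentBlacklistSketch(hashlib.sha512(), blacklist=(b"", b"\n"))
    h.update(b"")
    assert h.hexdigest() is None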
From 8d4c5512edbdcdd1063a7e6508f398a5a57981be Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Sun, 23 Feb 2014 18:19:35 +0100
Subject: spell check comments

---
 dedup/hashing.py | 2 +-
 dedup/image.py   | 2 +-
 importpkg.py     | 4 ++--
 webapp.py        | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'importpkg.py')

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 70f6268..a8a46c7 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -115,7 +115,7 @@ class DecompressedHash(object):
 
 class SuppressingHash(object):
     """A hash that silences exceptions from the update and hexdigest methods of
-    a hashlib-like object. If an exception has occured, hexdigest always
+    a hashlib-like object. If an exception has occurred, hexdigest always
     returns None."""
     def __init__(self, hashobj, exceptions=()):
         """
diff --git a/dedup/image.py b/dedup/image.py
index c1f2de0..ef17989 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,7 +4,7 @@ import struct
 import PIL.Image
 
 class ImageHash(object):
-    """A hash on the contents of an image datat type supported by PIL. This
+    """A hash on the contents of an image data type supported by PIL. This
     disregards mode, depth and meta information. Note that due to limitations
     in PIL and the image format (interlacing) the full contents are stored and
     decoded in hexdigest."""
diff --git a/importpkg.py b/importpkg.py
index cb16f97..aeccda5 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
-"""This tool reads a debian package from stdin and emits a yaml stream on
+"""This tool reads a Debian package from stdin and emits a yaml stream on
 stdout. It does not access a database. Therefore it can be run in parallel and
-on multiple machines. The generated yaml conatins multiple documents. The first
+on multiple machines. The generated yaml contains multiple documents. The first
 document contains package metadata. Then a document is emitted for each file.
 And finally a document consisting of the string "commit" is emitted."""
 
diff --git a/webapp.py b/webapp.py
index fd6d685..2fd69bb 100755
--- a/webapp.py
+++ b/webapp.py
@@ -151,7 +151,7 @@ class Application(object):
         return html_response(package_template.render(params))
 
     def compute_comparison(self, pid1, pid2):
-        """Compute a sequence of comparison objects ordery by the size of the
+        """Compute a sequence of comparison objects ordered by the size of the
         object in the first package. Each element of the sequence is a dict
         defining the following keys:
          * filenames: A set of filenames in package 1 (pid1) all referring to
--
cgit v1.2.3

From ddaa08f7a63a1fedf4c1f2804873199dd5182142 Mon Sep 17 00:00:00 2001
From: Guillem Jover
Date: Wed, 7 May 2014 21:06:38 +0200
Subject: importpkg: add support for control.tar and control.tar.xz

dpkg supports those since 1.17.6.

Signed-off-by: Guillem Jover
---
 dedup/debpkg.py |  2 +-
 importpkg.py    | 61 ++++++++++++++++++++++++++++++++++++++++++++++-----------------------
 2 files changed, 39 insertions(+), 24 deletions(-)

(limited to 'importpkg.py')

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 2d67135..dbee849 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -3,7 +3,7 @@ from debian import deb822
 from dedup.hashing import hash_file
 
 def process_control(control_contents):
-    """Parses the contents of a control file from a control.tar.gz of a Debian
+    """Parses the contents of a control file from a control.tar of a Debian
     package and returns a dictionary containing the fields relevant to dedup.
     @type control_contents: bytes
     @rtype: {str: object}
diff --git a/importpkg.py b/importpkg.py
index aeccda5..7482c4f 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -54,41 +54,56 @@ def process_package(filelike, hash_functions):
         except EOFError:
             raise ValueError("data.tar not found")
         if name == "control.tar.gz":
-            if state != "start":
-                raise ValueError("unexpected control.tar.gz")
-            state = "control"
+            new_state = "control"
             tf = tarfile.open(fileobj=af, mode="r|gz")
-            for elem in tf:
-                if elem.name != "./control":
-                    continue
-                if state != "control":
-                    raise ValueError("duplicate control file")
-                state = "control_file"
-                yield process_control(tf.extractfile(elem).read())
-                break
-            continue
+        elif name == "control.tar.xz":
+            new_state = "control"
+            zf = DecompressedStream(af, lzma.LZMADecompressor())
+            tf = tarfile.open(fileobj=zf, mode="r|")
+        elif name == "control.tar":
+            new_state = "control"
+            tf = tarfile.open(fileobj=af, mode="r|")
         elif name == "data.tar.gz":
+            new_state = "data"
             tf = tarfile.open(fileobj=af, mode="r|gz")
         elif name == "data.tar.bz2":
+            new_state = "data"
             tf = tarfile.open(fileobj=af, mode="r|bz2")
         elif name == "data.tar.xz":
+            new_state = "data"
             zf = DecompressedStream(af, lzma.LZMADecompressor())
             tf = tarfile.open(fileobj=zf, mode="r|")
         elif name == "data.tar":
+            new_state = "data"
             tf = tarfile.open(fileobj=af, mode="r|")
         else:
             continue
-        if state != "control_file":
-            raise ValueError("missing control file")
-        for name, size, hashes in get_tar_hashes(tf, hash_functions):
-            try:
-                name = name.decode("utf8")
-            except UnicodeDecodeError:
-                print("warning: skipping filename with encoding error")
-                continue # skip files with non-utf8 encoding for now
-            yield dict(name=name, size=size, hashes=hashes)
-        yield "commit"
-        break
+        if new_state == "control":
+            if state != "start":
+                raise ValueError("unexpected control.tar")
+            state = new_state
+            for elem in tf:
+                if elem.name != "./control":
+                    continue
+                if state != "control":
+                    raise ValueError("duplicate control file")
+                state = "control_file"
+                yield process_control(tf.extractfile(elem).read())
+                break
+            continue
+        elif new_state == "data":
+            if state != "control_file":
+                raise ValueError("missing control file")
+            state = new_state
+            for name, size, hashes in get_tar_hashes(tf, hash_functions):
+                try:
+                    name = name.decode("utf8")
+                except UnicodeDecodeError:
+                    print("warning: skipping filename with encoding error")
+                    continue # skip files with non-utf8 encoding for now
+                yield dict(name=name, size=size, hashes=hashes)
+            yield "commit"
+            break
 
 def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
--
cgit v1.2.3
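The new control.tar.xz branch reuses the same pattern as data.tar.xz: the ar member is wrapped in DecompressedStream so that tarfile can consume the decompressed bytes in streaming mode ("r|"), which only ever calls read() and never seeks. DecompressedStream is a dedup-internal helper (dedup/compression.py) that is not part of this series; the sketch below, written against the Python 3 standard-library lzma module, shows roughly what such a read-only wrapper has to provide.

    import lzma
    import tarfile

    class XzStreamSketch(object):
        """Illustrative stand-in for a DecompressedStream-like wrapper:
        a read()-only file object that decompresses lazily, so tarfile can
        consume e.g. control.tar.xz in streaming mode ("r|") without seeking."""

        def __init__(self, fileobj, chunksize=65536):
            self.fileobj = fileobj          # compressed, possibly non-seekable
            self.decompressor = lzma.LZMADecompressor()
            self.chunksize = chunksize
            self.buffer = b""

        def read(self, size=-1):
            # Pull compressed chunks until enough decompressed data is buffered.
            while not self.decompressor.eof and (size < 0 or len(self.buffer) < size):
                chunk = self.fileobj.read(self.chunksize)
                if not chunk:
                    break
                self.buffer += self.decompressor.decompress(chunk)
            if size < 0:
                data, self.buffer = self.buffer, b""
            else:
                data, self.buffer = self.buffer[:size], self.buffer[size:]
            return data

    # Usage with a compressed file object `member`:
    #     tf = tarfile.open(fileobj=XzStreamSketch(member), mode="r|")
    #     for entry in tf:
    #         ...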
"control": + if state != "start": + raise ValueError("unexpected control.tar") + state = new_state + for elem in tf: + if elem.name != "./control": + continue + if state != "control": + raise ValueError("duplicate control file") + state = "control_file" + yield process_control(tf.extractfile(elem).read()) + break + continue + elif new_state == "data": + if state != "control_file": + raise ValueError("missing control file") + state = new_state + for name, size, hashes in get_tar_hashes(tf, hash_functions): + try: + name = name.decode("utf8") + except UnicodeDecodeError: + print("warning: skipping filename with encoding error") + continue # skip files with non-utf8 encoding for now + yield dict(name=name, size=size, hashes=hashes) + yield "commit" + break def process_package_with_hash(filelike, hash_functions, sha256hash): hstream = HashedStream(filelike, hashlib.sha256()) -- cgit v1.2.3 From e2d7f5e1e1ef06c28ca432bc070595b87d78ac85 Mon Sep 17 00:00:00 2001 From: Guillem Jover Date: Thu, 8 May 2014 01:50:48 +0200 Subject: importpkg: add support for data.tar.lzma Creating packages with lzma compression has been deprecated since dpkg 1.16.4, but there might be some of those in the wild and supporting them is strightforward when xz is already supported. Signed-off-by: Guillem Jover --- importpkg.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'importpkg.py') diff --git a/importpkg.py b/importpkg.py index 7482c4f..247965f 100755 --- a/importpkg.py +++ b/importpkg.py @@ -69,6 +69,10 @@ def process_package(filelike, hash_functions): elif name == "data.tar.bz2": new_state = "data" tf = tarfile.open(fileobj=af, mode="r|bz2") + elif name == "data.tar.lzma": + new_state = "data" + zf = DecompressedStream(af, lzma.LZMADecompressor()) + tf = tarfile.open(fileobj=zf, mode="r|") elif name == "data.tar.xz": new_state = "data" zf = DecompressedStream(af, lzma.LZMADecompressor()) -- cgit v1.2.3 From 0097cdc7ffa881427f72ac35428de4214a26d834 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 11 May 2014 15:57:36 +0200 Subject: importpkg: reduce copy&paste --- importpkg.py | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) (limited to 'importpkg.py') diff --git a/importpkg.py b/importpkg.py index 247965f..06d9da4 100755 --- a/importpkg.py +++ b/importpkg.py @@ -44,6 +44,15 @@ def gifhash(): hashobj.name = "gif_sha512" return hashobj +def decompress_tar(filelike, extension): + if extension in (".lzma", ".xz"): + filelike = DecompressedStream(filelike, lzma.LZMADecompressor()) + extension = "" + if extension not in ("", ".gz", ".bz2"): + raise ValueError("unknown compression format with extension %r" % + extension) + return tarfile.open(fileobj=filelike, mode="r|" + extension[1:]) + def process_package(filelike, hash_functions): af = ArReader(filelike) af.read_magic() @@ -53,39 +62,11 @@ def process_package(filelike, hash_functions): name = af.read_entry() except EOFError: raise ValueError("data.tar not found") - if name == "control.tar.gz": - new_state = "control" - tf = tarfile.open(fileobj=af, mode="r|gz") - elif name == "control.tar.xz": - new_state = "control" - zf = DecompressedStream(af, lzma.LZMADecompressor()) - tf = tarfile.open(fileobj=zf, mode="r|") - elif name == "control.tar": - new_state = "control" - tf = tarfile.open(fileobj=af, mode="r|") - elif name == "data.tar.gz": - new_state = "data" - tf = tarfile.open(fileobj=af, mode="r|gz") - elif name == "data.tar.bz2": - new_state = "data" - tf = tarfile.open(fileobj=af, mode="r|bz2") - 
From 0097cdc7ffa881427f72ac35428de4214a26d834 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Sun, 11 May 2014 15:57:36 +0200
Subject: importpkg: reduce copy&paste

---
 importpkg.py | 48 +++++++++++++++---------------------------------
 1 file changed, 15 insertions(+), 33 deletions(-)

(limited to 'importpkg.py')

diff --git a/importpkg.py b/importpkg.py
index 247965f..06d9da4 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -44,6 +44,15 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
+def decompress_tar(filelike, extension):
+    if extension in (".lzma", ".xz"):
+        filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
+        extension = ""
+    if extension not in ("", ".gz", ".bz2"):
+        raise ValueError("unknown compression format with extension %r" %
+                         extension)
+    return tarfile.open(fileobj=filelike, mode="r|" + extension[1:])
+
 def process_package(filelike, hash_functions):
     af = ArReader(filelike)
     af.read_magic()
@@ -53,39 +62,11 @@ def process_package(filelike, hash_functions):
             name = af.read_entry()
         except EOFError:
             raise ValueError("data.tar not found")
-        if name == "control.tar.gz":
-            new_state = "control"
-            tf = tarfile.open(fileobj=af, mode="r|gz")
-        elif name == "control.tar.xz":
-            new_state = "control"
-            zf = DecompressedStream(af, lzma.LZMADecompressor())
-            tf = tarfile.open(fileobj=zf, mode="r|")
-        elif name == "control.tar":
-            new_state = "control"
-            tf = tarfile.open(fileobj=af, mode="r|")
-        elif name == "data.tar.gz":
-            new_state = "data"
-            tf = tarfile.open(fileobj=af, mode="r|gz")
-        elif name == "data.tar.bz2":
-            new_state = "data"
-            tf = tarfile.open(fileobj=af, mode="r|bz2")
-        elif name == "data.tar.lzma":
-            new_state = "data"
-            zf = DecompressedStream(af, lzma.LZMADecompressor())
-            tf = tarfile.open(fileobj=zf, mode="r|")
-        elif name == "data.tar.xz":
-            new_state = "data"
-            zf = DecompressedStream(af, lzma.LZMADecompressor())
-            tf = tarfile.open(fileobj=zf, mode="r|")
-        elif name == "data.tar":
-            new_state = "data"
-            tf = tarfile.open(fileobj=af, mode="r|")
-        else:
-            continue
-        if new_state == "control":
+        if name.startswith("control.tar"):
             if state != "start":
                 raise ValueError("unexpected control.tar")
-            state = new_state
+            state = "control"
+            tf = decompress_tar(af, name[11:])
             for elem in tf:
                 if elem.name != "./control":
                     continue
@@ -95,10 +76,11 @@ def process_package(filelike, hash_functions):
                 yield process_control(tf.extractfile(elem).read())
                 break
             continue
-        elif new_state == "data":
+        elif name.startswith("data.tar"):
             if state != "control_file":
                 raise ValueError("missing control file")
-            state = new_state
+            state = "data"
+            tf = decompress_tar(af, name[8:])
             for name, size, hashes in get_tar_hashes(tf, hash_functions):
                 try:
                     name = name.decode("utf8")
--
cgit v1.2.3
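The refactor pushes all format handling into decompress_tar() and derives the compression suffix by slicing the ar member name: name[11:] drops the leading "control.tar" and name[8:] drops "data.tar". A throwaway illustration of that dispatch (not part of the repository):

    # len("control.tar") == 11 and len("data.tar") == 8, so the slices passed to
    # decompress_tar() are exactly the compression suffixes.
    for member in ("control.tar", "control.tar.gz", "control.tar.xz",
                   "data.tar", "data.tar.gz", "data.tar.bz2",
                   "data.tar.lzma", "data.tar.xz"):
        extension = member[11:] if member.startswith("control.tar") else member[8:]
        # .lzma/.xz are routed through DecompressedStream first; the remaining
        # suffixes map onto tarfile's streaming modes "r|", "r|gz" and "r|bz2".
        mode = "r|" if extension in (".lzma", ".xz") else "r|" + extension[1:]
        print("%-15s extension=%-8r mode=%r" % (member, extension, mode))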