From 17597b5e828f9bbc9b0159102b173c284c23a140 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 19 Feb 2014 07:54:21 +0100 Subject: DecompressedHash should fail on trailing input Otherwise all files smaller than 10 bytes are successfully hashed to the hash of the empty input when using the GzipDecompressor. Reported-By: Olly Betts --- dedup/hashing.py | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'dedup/hashing.py') diff --git a/dedup/hashing.py b/dedup/hashing.py index 002eda8..5f015b2 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -49,9 +49,13 @@ class DecompressedHash(object): def hexdigest(self): if not hasattr(self.decompressor, "flush"): + if self.decompressor.unused_data: + raise ValueError("decompressor did not consume all data") return self.hashobj.hexdigest() tmpdecomp = self.decompressor.copy() data = tmpdecomp.flush() + if tmpdecomp.unused_data: + raise ValueError("decompressor did not consume all data") tmphash = self.hashobj.copy() tmphash.update(data) return tmphash.hexdigest() -- cgit v1.2.3 From 332ac9eafb235443f163c606ced95dcbd615815e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 19 Feb 2014 14:21:20 +0100 Subject: blacklist content rather than hashes Otherwise the gzip hash cannot tell the empty stream and the compressed empty stream apart. --- dedup/hashing.py | 50 ++++++++++++++++++++++++++++++++++++++++++++++++++ importpkg.py | 15 +++++---------- 2 files changed, 55 insertions(+), 10 deletions(-) (limited to 'dedup/hashing.py') diff --git a/dedup/hashing.py b/dedup/hashing.py index 5f015b2..70f6268 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -1,3 +1,5 @@ +import itertools + class HashBlacklist(object): """Turn a hashlib-like object into a hash that returns None for some blacklisted hashes instead of the real hash value. @@ -29,6 +31,54 @@ class HashBlacklist(object): def copy(self): return HashBlacklist(self.hashobj.copy(), self.blacklist) +class HashBlacklistContent(object): + """Turn a hashlib-like object into a hash that returns None for some + blacklisted content instead of the real hash value. Unlike HashBlacklist, + not the output of the hash is considered, but its input.""" + + def __init__(self, hashobj, blacklist=(), maxlen=None): + """ + @param hashobj: a hashlib-like object + @param blacklist: an object providing __contains__. + hash inputs which are contained in the blacklist + are turned into None values + @param maxlen: the maximum length of a blacklisted input. + Defaults to max(map(len, blacklist)), so if it is absent, + the blacklist must support iteration. + """ + self.hashobj = hashobj + self.blacklist = blacklist + if maxlen is None: + # the chain avoids passing the empty sequence to max + maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist))) + self.maxlen = maxlen + self.stored = "" + + @property + def name(self): + return self.hashobj.name + + def update(self, data): + if self.stored is not None: + self.stored += data + if len(self.stored) > self.maxlen: + self.stored = None + self.hashobj.update(data) + + def digest(self): + if self.stored is not None and self.stored in self.blacklist: + return None + return self.hashobj.digest() + + def hexdigest(self): + if self.stored is not None and self.stored in self.blacklist: + return None + return self.hashobj.hexdigest() + + def copy(self): + return HashBlacklistContent(self.hashobj.copy(), self.blacklist, + self.maxlen) + class DecompressedHash(object): """Apply a decompression function before the hash. This class provides the hashlib interface (update, hexdigest, copy) excluding digest and name.""" diff --git a/importpkg.py b/importpkg.py index 54f6181..cb16f97 100755 --- a/importpkg.py +++ b/importpkg.py @@ -16,26 +16,21 @@ import yaml from dedup.arreader import ArReader from dedup.debpkg import process_control, get_tar_hashes -from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \ - HashedStream +from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \ + HashBlacklistContent from dedup.compression import GzipDecompressor, DecompressedStream from dedup.image import GIFHash, PNGHash -boring_sha512_hashes = set(( - # "" - "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e", - # "\n" - "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09")) +boring_content = set(("", "\n")) def sha512_nontrivial(): - return HashBlacklist(hashlib.sha512(), boring_sha512_hashes) + return HashBlacklistContent(hashlib.sha512(), boring_content) def gziphash(): hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512()) hashobj = SuppressingHash(hashobj, (ValueError, zlib.error)) hashobj.name = "gzip_sha512" - # don't blacklist boring hashes for gzip to get gzip issues right - return hashobj + return HashBlacklistContent(hashobj, boring_content) def pnghash(): hashobj = PNGHash(hashlib.sha512()) -- cgit v1.2.3 From 8d4c5512edbdcdd1063a7e6508f398a5a57981be Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 23 Feb 2014 18:19:35 +0100 Subject: spell check comments --- dedup/hashing.py | 2 +- dedup/image.py | 2 +- importpkg.py | 4 ++-- webapp.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'dedup/hashing.py') diff --git a/dedup/hashing.py b/dedup/hashing.py index 70f6268..a8a46c7 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -115,7 +115,7 @@ class DecompressedHash(object): class SuppressingHash(object): """A hash that silences exceptions from the update and hexdigest methods of - a hashlib-like object. If an exception has occured, hexdigest always + a hashlib-like object. If an exception has occurred, hexdigest always returns None.""" def __init__(self, hashobj, exceptions=()): """ diff --git a/dedup/image.py b/dedup/image.py index c1f2de0..ef17989 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -4,7 +4,7 @@ import struct import PIL.Image class ImageHash(object): - """A hash on the contents of an image datat type supported by PIL. This + """A hash on the contents of an image data type supported by PIL. This disregards mode, depth and meta information. Note that due to limitations in PIL and the image format (interlacing) the full contents are stored and decoded in hexdigest.""" diff --git a/importpkg.py b/importpkg.py index cb16f97..aeccda5 100755 --- a/importpkg.py +++ b/importpkg.py @@ -1,7 +1,7 @@ #!/usr/bin/python -"""This tool reads a debian package from stdin and emits a yaml stream on +"""This tool reads a Debian package from stdin and emits a yaml stream on stdout. It does not access a database. Therefore it can be run in parallel and -on multiple machines. The generated yaml conatins multiple documents. The first +on multiple machines. The generated yaml contains multiple documents. The first document contains package metadata. Then a document is emitted for each file. And finally a document consisting of the string "commit" is emitted.""" diff --git a/webapp.py b/webapp.py index fd6d685..2fd69bb 100755 --- a/webapp.py +++ b/webapp.py @@ -151,7 +151,7 @@ class Application(object): return html_response(package_template.render(params)) def compute_comparison(self, pid1, pid2): - """Compute a sequence of comparison objects ordery by the size of the + """Compute a sequence of comparison objects ordered by the size of the object in the first package. Each element of the sequence is a dict defining the following keys: * filenames: A set of filenames in package 1 (pid1) all referring to -- cgit v1.2.3