diff options
author | Helmut Grohne <helmut@subdivi.de> | 2014-07-22 08:56:42 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2014-07-22 08:56:42 +0200 |
commit | 04597f25729740406775a3dff528c9774c84efd5 (patch) | |
tree | fe905fc94afbdcfad60d5aaf88886a1f10f92a8c /dedup/hashing.py | |
parent | ba9ae116e0bbb25e2df327ba48c82472ccfa2690 (diff) | |
parent | d48c3c208ee6ba54225b3eb68ce5c9f3c894bfa4 (diff) | |
download | debian-dedup-04597f25729740406775a3dff528c9774c84efd5.tar.gz |
Merge branch master into multiarch
Resolve accumulated conflicts. In particular webapp.py gained a few
non-trivial ones, such as changes in InternalRedirect or usage of
contextlib.closing.
Conflicts:
schema.sql
webapp.py
Diffstat (limited to 'dedup/hashing.py')
-rw-r--r-- | dedup/hashing.py | 56 |
1 files changed, 55 insertions, 1 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..a8a46c7 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
 class HashBlacklist(object):
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
     def copy(self):
         return HashBlacklist(self.hashobj.copy(), self.blacklist)
 
+class HashBlacklistContent(object):
+    """Turn a hashlib-like object into a hash that returns None for some
+    blacklisted content instead of the real hash value. Unlike HashBlacklist,
+    not the output of the hash is considered, but its input."""
+
+    def __init__(self, hashobj, blacklist=(), maxlen=None):
+        """
+        @param hashobj: a hashlib-like object
+        @param blacklist: an object providing __contains__.
+            hash inputs which are contained in the blacklist
+            are turned into None values
+        @param maxlen: the maximum length of a blacklisted input.
+            Defaults to max(map(len, blacklist)), so if it is absent,
+            the blacklist must support iteration.
+        """
+        self.hashobj = hashobj
+        self.blacklist = blacklist
+        if maxlen is None:
+            # the chain avoids passing the empty sequence to max
+            maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist)))
+        self.maxlen = maxlen
+        self.stored = ""
+
+    @property
+    def name(self):
+        return self.hashobj.name
+
+    def update(self, data):
+        if self.stored is not None:
+            self.stored += data
+            if len(self.stored) > self.maxlen:
+                self.stored = None
+        self.hashobj.update(data)
+
+    def digest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.digest()
+
+    def hexdigest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.hexdigest()
+
+    def copy(self):
+        return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+                                    self.maxlen)
+
 class DecompressedHash(object):
     """Apply a decompression function before the hash. This class provides the
     hashlib interface (update, hexdigest, copy) excluding digest and name."""
@@ -49,9 +99,13 @@ class DecompressedHash(object):
 
     def hexdigest(self):
         if not hasattr(self.decompressor, "flush"):
+            if self.decompressor.unused_data:
+                raise ValueError("decompressor did not consume all data")
             return self.hashobj.hexdigest()
         tmpdecomp = self.decompressor.copy()
         data = tmpdecomp.flush()
+        if tmpdecomp.unused_data:
+            raise ValueError("decompressor did not consume all data")
         tmphash = self.hashobj.copy()
         tmphash.update(data)
         return tmphash.hexdigest()
@@ -61,7 +115,7 @@ class DecompressedHash(object):
 
 class SuppressingHash(object):
     """A hash that silences exceptions from the update and hexdigest methods of
-    a hashlib-like object. If an exception has occured, hexdigest always
+    a hashlib-like object. If an exception has occurred, hexdigest always
     returns None."""
     def __init__(self, hashobj, exceptions=()):
         """