summaryrefslogtreecommitdiff
path: root/dedup/hashing.py
diff options
context:
space:
mode:
Diffstat (limited to 'dedup/hashing.py')
-rw-r--r--dedup/hashing.py50
1 files changed, 50 insertions, 0 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 5f015b2..70f6268 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
class HashBlacklist(object):
"""Turn a hashlib-like object into a hash that returns None for some
blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
def copy(self):
return HashBlacklist(self.hashobj.copy(), self.blacklist)
class HashBlacklistContent(object):
    """Turn a hashlib-like object into a hash that returns None for some
    blacklisted content instead of the real hash value. Unlike HashBlacklist,
    not the output of the hash is considered, but its input."""

    def __init__(self, hashobj, blacklist=(), maxlen=None):
        """
        @param hashobj: a hashlib-like object
        @param blacklist: an object providing __contains__.
            hash inputs which are contained in the blacklist
            are turned into None values
        @param maxlen: the maximum length of a blacklisted input.
            Defaults to max(map(len, blacklist)), so if it is absent,
            the blacklist must support iteration.
        """
        self.hashobj = hashobj
        self.blacklist = blacklist
        if maxlen is None:
            # The chain avoids passing the empty sequence to max. A generator
            # expression is used instead of itertools.imap, which only
            # exists on Python 2.
            lengths = (len(item) for item in blacklist)
            maxlen = max(itertools.chain((0,), lengths))
        self.maxlen = maxlen
        # Input seen so far, or None once it grew longer than maxlen and
        # therefore cannot match any blacklisted input anymore. A bytes
        # literal is the same as "" on Python 2 and can be concatenated with
        # the bytes passed to update on Python 3.
        self.stored = b""

    @property
    def name(self):
        """The name of the wrapped hash object."""
        return self.hashobj.name

    def update(self, data):
        """Feed data to the wrapped hash and remember it as long as the
        total input is no longer than maxlen.

        @param data: the next piece of hash input
        """
        if self.stored is not None:
            self.stored += data
            if len(self.stored) > self.maxlen:
                # Too long to be blacklisted: stop buffering the input.
                self.stored = None
        self.hashobj.update(data)

    def _is_blacklisted(self):
        """Tell whether the complete input seen so far is blacklisted."""
        return self.stored is not None and self.stored in self.blacklist

    def digest(self):
        """
        @returns: None if the input is blacklisted, otherwise the digest of
            the wrapped hash object
        """
        if self._is_blacklisted():
            return None
        return self.hashobj.digest()

    def hexdigest(self):
        """
        @returns: None if the input is blacklisted, otherwise the hexdigest
            of the wrapped hash object
        """
        if self._is_blacklisted():
            return None
        return self.hashobj.hexdigest()

    def copy(self):
        """
        @returns: an independent HashBlacklistContent in the same state
        """
        new = HashBlacklistContent(self.hashobj.copy(), self.blacklist,
                                   self.maxlen)
        # Carry over the buffered input. Without this the copy would forget
        # what was already hashed and judge the blacklist on a suffix only.
        new.stored = self.stored
        return new
+
class DecompressedHash(object):
"""Apply a decompression function before the hash. This class provides the
hashlib interface (update, hexdigest, copy) excluding digest and name."""