diff options
Diffstat (limited to 'dedup/hashing.py')
-rw-r--r-- | dedup/hashing.py | 60 |
1 files changed, 60 insertions, 0 deletions
import io
import os


class StoredHash(object):
    """A hash that stores all the updates and then does all the work on the
    hexdigest call.

    The data passed to update is buffered in memory. The buffer is dropped
    (and hexdigest returns None from then on) as soon as the content grows
    beyond the size limit or the accept function rejects it.
    """

    def __init__(self, digestfunc, acceptfunc=lambda _: True,
                 sizelimit=1024*1024*16):
        """
        @type digestfunc: file-like -> str
        @param digestfunc: should read the given file-like and return
            the computed hash. The file-like can be assumed to be seekable.
        @type acceptfunc: file-like -> bool or None
        @param acceptfunc: should read enough of file-like to determine
            whether the hash is computable. To accept the hash, return True.
            To reject producing a hash return False. To defer the decision
            until more data is available return None.
        @type sizelimit: int
        @param sizelimit: when the content exceeds this size, reject it
        """
        self.digestfunc = digestfunc
        self.acceptfunc = acceptfunc
        self.sizelimit = sizelimit
        # Buffer of all data seen so far; None once the content is rejected.
        self.content = io.BytesIO()
        # Becomes True once acceptfunc returned a true value.
        self.accepted = False

    def update(self, data):
        """Append data to the stored content.

        Empty data and updates after a rejection are ignored. Until the
        content has been accepted, acceptfunc is consulted after every
        non-empty update with the buffer rewound to its start.

        @param data: bytes-like chunk to append
        """
        if self.content is None or not data:
            return
        # acceptfunc or digestfunc may have moved the position, so always
        # append at the end.
        self.content.seek(0, os.SEEK_END)
        if self.content.tell() + len(data) > self.sizelimit:
            # Too large: drop the buffer, which also frees the memory.
            self.content = None
            return
        self.content.write(data)
        if not self.accepted:
            self.content.seek(0, os.SEEK_SET)
            ret = self.acceptfunc(self.content)
            if ret is None:
                # Decision deferred until more data arrives.
                return
            if ret:
                self.accepted = True
            else:
                self.content = None

    def hexdigest(self):
        """Compute the hash of the stored content using digestfunc.

        @returns: the value returned by digestfunc or None if the content
            was rejected or has not been accepted (yet). Since empty updates
            never reach acceptfunc, an empty stream also yields None.
        """
        # Explicit None test: a BytesIO object is truthy even when empty.
        if self.content is None or not self.accepted:
            return None
        self.content.seek(0, os.SEEK_SET)
        return self.digestfunc(self.content)

    def copy(self):
        """Create an independent StoredHash with the same functions, size
        limit, acceptance state and a private copy of the stored content.

        @rtype: StoredHash
        """
        new = StoredHash(self.digestfunc, self.acceptfunc, self.sizelimit)
        if self.content is None:
            new.content = None
        else:
            new.content = io.BytesIO(self.content.getvalue())
        new.accepted = self.accepted
        return new