diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-03-08 16:33:09 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-03-08 16:33:09 +0100 |
commit | 0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4 (patch) | |
tree | ad0e188b62806ecc7d2d806f1f8958b0a1fd8bbf /dedup/hashing.py | |
parent | 5b5cf7f2629c3a6c78f6057ff1e8476ff001409f (diff) | |
download | debian-dedup-0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4.tar.gz |
generalize ImageHash to StoredHash
Diffstat (limited to 'dedup/hashing.py')
-rw-r--r-- | dedup/hashing.py | 60 |
1 files changed, 60 insertions, 0 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 1283c7e..0c786e1 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,6 @@
+import io
+import os
+
 class HashBlacklist(object):
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted hashes instead of the real hash value.
@@ -96,6 +99,63 @@ class SuppressingHash(object):
             return SuppressingHash(self.hashobj.copy(), self.exceptions)
         return SuppressingHash(None, self.exceptions)
 
+class StoredHash(object):
+    """A hash that stores all the updates and then does all the work on the
+    hexdigest call."""
+
+    def __init__(self, digestfunc, acceptfunc=lambda _: True,
+            sizelimit=1024*1024*16):
+        """
+        @type digestfunc: file-like -> str
+        @param digestfunc: should read the given file-like and return
+            the computed hash. The file-like can be assumed to be seekable.
+        @type acceptfunc: file-like -> bool or None
+            should read enough of file-like to determine whether the hash
+            is computable. To accept the hash, return True. To reject
+            producing a hash return False. To defer the decision until more
+            data is available return None.
+        @type sizelimit: int
+        @param sizelimit: when the content exceeds this size, reject it
+        """
+        self.digestfunc = digestfunc
+        self.acceptfunc = acceptfunc
+        self.sizelimit = sizelimit
+        self.content = io.BytesIO()
+        self.accepted = False
+
+    def update(self, data):
+        if self.content is None or not data:
+            return
+        self.content.seek(0, os.SEEK_END)
+        if self.content.tell() + len(data) > self.sizelimit:
+            self.content = None
+            return
+        self.content.write(data)
+        if not self.accepted:
+            self.content.seek(0, os.SEEK_SET)
+            ret = self.acceptfunc(self.content)
+            if ret is None:
+                return
+            if ret:
+                self.accepted = True
+            else:
+                self.content = None
+
+    def hexdigest(self):
+        if not self.content or not self.accepted:
+            return None
+        self.content.seek(0, os.SEEK_SET)
+        return self.digestfunc(self.content)
+
+    def copy(self):
+        new = StoredHash(self.digestfunc, self.acceptfunc, self.sizelimit)
+        if self.content:
+            new.content = io.BytesIO(self.content.getvalue())
+        else:
+            new.content = None
+        new.accepted = self.accepted
+        return new
+
 def hash_file(hashobj, filelike, blocksize=65536):
     """Feed the entire contents from the given filelike to the given hashobj.
     @param hashobj: hashlib-like object providing an update method