summary | refs | log | tree | commit | diff
path: root/dedup/hashing.py
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de> 2013-03-08 16:33:09 +0100
committer: Helmut Grohne <helmut@subdivi.de> 2013-03-08 16:33:09 +0100
commit: 0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4 (patch)
tree: ad0e188b62806ecc7d2d806f1f8958b0a1fd8bbf /dedup/hashing.py
parent: 5b5cf7f2629c3a6c78f6057ff1e8476ff001409f (diff)
download: debian-dedup-0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4.tar.gz
generalize ImageHash to StoredHash
Diffstat (limited to 'dedup/hashing.py')
-rw-r--r-- dedup/hashing.py 60
1 files changed, 60 insertions, 0 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 1283c7e..0c786e1 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,6 @@
+import io
+import os
+
class HashBlacklist(object):
"""Turn a hashlib-like object into a hash that returns None for some
blacklisted hashes instead of the real hash value.
@@ -96,6 +99,63 @@ class SuppressingHash(object):
return SuppressingHash(self.hashobj.copy(), self.exceptions)
return SuppressingHash(None, self.exceptions)
class StoredHash(object):
    """A hash that stores all the updates and then does all the work on the
    hexdigest call.

    The data passed to update is buffered in memory. While buffering, the
    acceptfunc is consulted to decide whether a hash should be produced at
    all. Once the content has been rejected (by the acceptfunc or by
    exceeding the sizelimit) the buffer is dropped and all further updates
    are ignored.
    """

    def __init__(self, digestfunc, acceptfunc=lambda _: True,
                 sizelimit=1024*1024*16):
        """
        @type digestfunc: file-like -> str
        @param digestfunc: should read the given file-like and return
            the computed hash. The file-like can be assumed to be seekable.
        @type acceptfunc: file-like -> bool or None
        @param acceptfunc: should read enough of file-like to determine
            whether the hash is computable. To accept the hash, return True.
            To reject producing a hash return False. To defer the decision
            until more data is available return None.
        @type sizelimit: int
        @param sizelimit: when the content exceeds this size, reject it
        """
        self.digestfunc = digestfunc
        self.acceptfunc = acceptfunc
        self.sizelimit = sizelimit
        # Buffer of everything fed so far; None once the content is rejected.
        self.content = io.BytesIO()
        # Becomes True as soon as acceptfunc returns a true value.
        self.accepted = False

    def update(self, data):
        """Append data to the stored content.

        Empty data and updates after a rejection are ignored. As long as the
        content has not been accepted, the acceptfunc is called with the
        whole buffer (positioned at its start) after each non-empty update.

        @type data: bytes
        @param data: next chunk of the content to be hashed
        """
        if self.content is None or not data:
            return
        # The buffer position may have been moved by acceptfunc, digestfunc
        # or a previous update, so always append at the end.
        self.content.seek(0, os.SEEK_END)
        if self.content.tell() + len(data) > self.sizelimit:
            self.content = None
            return
        self.content.write(data)
        if self.accepted:
            return
        self.content.seek(0, os.SEEK_SET)
        verdict = self.acceptfunc(self.content)
        if verdict is None:
            # Decision deferred: ask again on the next update.
            return
        if verdict:
            self.accepted = True
        else:
            self.content = None

    def hexdigest(self):
        """Compute the hash by passing the stored content to the digestfunc.

        @rtype: str or None
        @returns: the value returned by digestfunc, or None if the content
            was rejected, exceeded the sizelimit or has not been accepted.
            Since the acceptfunc is only consulted from update, None is
            also returned when no non-empty data was ever fed.
        """
        if self.content is None or not self.accepted:
            return None
        self.content.seek(0, os.SEEK_SET)
        return self.digestfunc(self.content)

    def copy(self):
        """Create an independent StoredHash with the same functions, size
        limit, buffered content and acceptance state.

        @rtype: StoredHash
        """
        new = StoredHash(self.digestfunc, self.acceptfunc, self.sizelimit)
        if self.content is None:
            new.content = None
        else:
            # Duplicate the bytes so that later updates do not affect the
            # original object.
            new.content = io.BytesIO(self.content.getvalue())
        new.accepted = self.accepted
        return new
+
def hash_file(hashobj, filelike, blocksize=65536):
"""Feed the entire contents from the given filelike to the given hashobj.
@param hashobj: hashlib-like object providing an update method