From 852c5c03e96b5c72dae157375a1fa36045a881d4 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 21 Feb 2013 17:10:54 +0100 Subject: move hashing functions to module dedup.hashing --- dedup/__init__.py | 0 dedup/__init__.pyc | Bin 0 -> 106 bytes dedup/hashing.py | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++ dedup/hashing.pyc | Bin 0 -> 4886 bytes 4 files changed, 108 insertions(+) create mode 100644 dedup/__init__.py create mode 100644 dedup/__init__.pyc create mode 100644 dedup/hashing.py create mode 100644 dedup/hashing.pyc (limited to 'dedup') diff --git a/dedup/__init__.py b/dedup/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dedup/__init__.pyc b/dedup/__init__.pyc new file mode 100644 index 0000000..8f09308 Binary files /dev/null and b/dedup/__init__.pyc differ diff --git a/dedup/hashing.py b/dedup/hashing.py new file mode 100644 index 0000000..1283c7e --- /dev/null +++ b/dedup/hashing.py @@ -0,0 +1,108 @@ +class HashBlacklist(object): + """Turn a hashlib-like object into a hash that returns None for some + blacklisted hashes instead of the real hash value. + + We only work with hexdigests here, so diget() disappears. The methods + copy and update as well as the name attribute keep working as expected. + """ + def __init__(self, hashobj, blacklist=()): + """ + @param hashobj: a hashlib-like object + @param blacklist: an object providing __contains__. + hexdigest values which are contained in the blacklist + are turned into None values + """ + self.hashobj = hashobj + self.blacklist = blacklist + self.update = self.hashobj.update + + @property + def name(self): + return self.hashobj.name + + def hexdigest(self): + digest = self.hashobj.hexdigest() + if digest in self.blacklist: + return None + return digest + + def copy(self): + return HashBlacklist(self.hashobj.copy(), self.blacklist) + +class DecompressedHash(object): + """Apply a decompression function before the hash. This class provides the + hashlib interface (update, hexdigest, copy) excluding digest and name.""" + def __init__(self, decompressor, hashobj): + """ + @param decompressor: a decompression object like bz2.BZ2Decompressor or + lzma.LZMADecompressor. It has to provide methods decompress and + copy as well as an unused_data attribute. It may provide a flush + method. + @param hashobj: a hashlib-like obj providing methods update, hexdigest + and copy + """ + self.decompressor = decompressor + self.hashobj = hashobj + + def update(self, data): + self.hashobj.update(self.decompressor.decompress(data)) + + def hexdigest(self): + if not hasattr(self.decompressor, "flush"): + return self.hashobj.hexdigest() + tmpdecomp = self.decompressor.copy() + data = tmpdecomp.flush() + tmphash = self.hashobj.copy() + tmphash.update(data) + return tmphash.hexdigest() + + def copy(self): + return DecompressedHash(self.decompressor.copy(), self.hashobj.copy()) + +class SuppressingHash(object): + """A hash that silences exceptions from the update and hexdigest methods of + a hashlib-like object. If an exception has occured, hexdigest always + returns None.""" + def __init__(self, hashobj, exceptions=()): + """ + @param hashobj: a hashlib-like object providing methods update, copy + and hexdigest. If a name attribute is present, it is mirrored as + well. + @type exceptions: tuple + @param exceptions: exception classes to be suppressed + """ + self.hashobj = hashobj + self.exceptions = exceptions + if hasattr(hashobj, "name"): + self.name = hashobj.name + + def update(self, data): + if self.hashobj: + try: + self.hashobj.update(data) + except self.exceptions: + self.hashobj = None + + def hexdigest(self): + if self.hashobj: + try: + return self.hashobj.hexdigest() + except self.exceptions: + self.hashobj = None + return None + + def copy(self): + if self.hashobj: + return SuppressingHash(self.hashobj.copy(), self.exceptions) + return SuppressingHash(None, self.exceptions) + +def hash_file(hashobj, filelike, blocksize=65536): + """Feed the entire contents from the given filelike to the given hashobj. + @param hashobj: hashlib-like object providing an update method + @param filelike: file-like object providing read(size) + """ + data = filelike.read(blocksize) + while data: + hashobj.update(data) + data = filelike.read(blocksize) + return hashobj diff --git a/dedup/hashing.pyc b/dedup/hashing.pyc new file mode 100644 index 0000000..7d2383a Binary files /dev/null and b/dedup/hashing.pyc differ -- cgit v1.2.3