From 852c5c03e96b5c72dae157375a1fa36045a881d4 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 21 Feb 2013 17:10:54 +0100 Subject: move hashing functions to module dedup.hashing --- dedup/__init__.py | 0 dedup/__init__.pyc | Bin 0 -> 106 bytes dedup/hashing.py | 108 +++++++++++++++++++++++++++++++++++++++++++++++++++++ dedup/hashing.pyc | Bin 0 -> 4886 bytes importpkg.py | 59 +---------------------------- 5 files changed, 110 insertions(+), 57 deletions(-) create mode 100644 dedup/__init__.py create mode 100644 dedup/__init__.pyc create mode 100644 dedup/hashing.py create mode 100644 dedup/hashing.pyc diff --git a/dedup/__init__.py b/dedup/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/dedup/__init__.pyc b/dedup/__init__.pyc new file mode 100644 index 0000000..8f09308 Binary files /dev/null and b/dedup/__init__.pyc differ diff --git a/dedup/hashing.py b/dedup/hashing.py new file mode 100644 index 0000000..1283c7e --- /dev/null +++ b/dedup/hashing.py @@ -0,0 +1,108 @@ +class HashBlacklist(object): + """Turn a hashlib-like object into a hash that returns None for some + blacklisted hashes instead of the real hash value. + + We only work with hexdigests here, so diget() disappears. The methods + copy and update as well as the name attribute keep working as expected. + """ + def __init__(self, hashobj, blacklist=()): + """ + @param hashobj: a hashlib-like object + @param blacklist: an object providing __contains__. + hexdigest values which are contained in the blacklist + are turned into None values + """ + self.hashobj = hashobj + self.blacklist = blacklist + self.update = self.hashobj.update + + @property + def name(self): + return self.hashobj.name + + def hexdigest(self): + digest = self.hashobj.hexdigest() + if digest in self.blacklist: + return None + return digest + + def copy(self): + return HashBlacklist(self.hashobj.copy(), self.blacklist) + +class DecompressedHash(object): + """Apply a decompression function before the hash. This class provides the + hashlib interface (update, hexdigest, copy) excluding digest and name.""" + def __init__(self, decompressor, hashobj): + """ + @param decompressor: a decompression object like bz2.BZ2Decompressor or + lzma.LZMADecompressor. It has to provide methods decompress and + copy as well as an unused_data attribute. It may provide a flush + method. + @param hashobj: a hashlib-like obj providing methods update, hexdigest + and copy + """ + self.decompressor = decompressor + self.hashobj = hashobj + + def update(self, data): + self.hashobj.update(self.decompressor.decompress(data)) + + def hexdigest(self): + if not hasattr(self.decompressor, "flush"): + return self.hashobj.hexdigest() + tmpdecomp = self.decompressor.copy() + data = tmpdecomp.flush() + tmphash = self.hashobj.copy() + tmphash.update(data) + return tmphash.hexdigest() + + def copy(self): + return DecompressedHash(self.decompressor.copy(), self.hashobj.copy()) + +class SuppressingHash(object): + """A hash that silences exceptions from the update and hexdigest methods of + a hashlib-like object. If an exception has occured, hexdigest always + returns None.""" + def __init__(self, hashobj, exceptions=()): + """ + @param hashobj: a hashlib-like object providing methods update, copy + and hexdigest. If a name attribute is present, it is mirrored as + well. + @type exceptions: tuple + @param exceptions: exception classes to be suppressed + """ + self.hashobj = hashobj + self.exceptions = exceptions + if hasattr(hashobj, "name"): + self.name = hashobj.name + + def update(self, data): + if self.hashobj: + try: + self.hashobj.update(data) + except self.exceptions: + self.hashobj = None + + def hexdigest(self): + if self.hashobj: + try: + return self.hashobj.hexdigest() + except self.exceptions: + self.hashobj = None + return None + + def copy(self): + if self.hashobj: + return SuppressingHash(self.hashobj.copy(), self.exceptions) + return SuppressingHash(None, self.exceptions) + +def hash_file(hashobj, filelike, blocksize=65536): + """Feed the entire contents from the given filelike to the given hashobj. + @param hashobj: hashlib-like object providing an update method + @param filelike: file-like object providing read(size) + """ + data = filelike.read(blocksize) + while data: + hashobj.update(data) + data = filelike.read(blocksize) + return hashobj diff --git a/dedup/hashing.pyc b/dedup/hashing.pyc new file mode 100644 index 0000000..7d2383a Binary files /dev/null and b/dedup/hashing.pyc differ diff --git a/importpkg.py b/importpkg.py index a45720a..eb3b3ec 100755 --- a/importpkg.py +++ b/importpkg.py @@ -18,6 +18,8 @@ from debian.debian_support import version_compare from debian import deb822 import lzma +from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file + class ArReader(object): global_magic = b"!\n" file_magic = b"`\n" @@ -103,19 +105,6 @@ class MultiHash(object): for hasher in self.hashes: hasher.update(data) -class HashBlacklist(object): - def __init__(self, hasher, blacklist=set()): - self.hasher = hasher - self.blacklist = blacklist - self.update = self.hasher.update - self.name = hasher.name - - def hexdigest(self): - digest = self.hasher.hexdigest() - if digest in self.blacklist: - return None - return digest - class GzipDecompressor(object): def __init__(self): self.inbuffer = b"" @@ -175,50 +164,6 @@ class GzipDecompressor(object): new.decompressor = self.decompressor.copy() return new -class DecompressedHash(object): - def __init__(self, decompressor, hashobj): - self.decompressor = decompressor - self.hashobj = hashobj - - def update(self, data): - self.hashobj.update(self.decompressor.decompress(data)) - - def hexdigest(self): - if not hasattr(self.decompressor, "flush"): - return self.hashobj.hexdigest() - tmpdecomp = self.decompressor.copy() - data = tmpdecomp.flush() - tmphash = self.hashobj.copy() - tmphash.update(data) - return tmphash.hexdigest() - -class SuppressingHash(object): - def __init__(self, hashobj, exceptions=()): - self.hashobj = hashobj - self.exceptions = exceptions - - def update(self, data): - if self.hashobj: - try: - self.hashobj.update(data) - except self.exceptions: - self.hashobj = None - - def hexdigest(self): - if self.hashobj: - try: - return self.hashobj.hexdigest() - except self.exceptions: - self.hashobj = None - return None - -def hash_file(hashobj, filelike, blocksize=65536): - data = filelike.read(blocksize) - while data: - hashobj.update(data) - data = filelike.read(blocksize) - return hashobj - boring_sha512_hashes = set(( # "" "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e", -- cgit v1.2.3