summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de> 2014-02-19 14:21:20 +0100
committer: Helmut Grohne <helmut@subdivi.de> 2014-02-19 14:21:20 +0100
commit: 332ac9eafb235443f163c606ced95dcbd615815e (patch)
tree: 00a3071fe331f4ed52ac058bf3616e55b24bed69
parent: d467a2a4e85d4b6f09bd2e3dc70466bfcc45a577 (diff)
download: debian-dedup-332ac9eafb235443f163c606ced95dcbd615815e.tar.gz
blacklist content rather than hashes
Otherwise the gzip hash cannot tell the empty stream and the compressed empty stream apart.
-rw-r--r-- dedup/hashing.py 50
-rwxr-xr-x importpkg.py 15
2 files changed, 55 insertions, 10 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 5f015b2..70f6268 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
class HashBlacklist(object):
"""Turn a hashlib-like object into a hash that returns None for some
blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
def copy(self):
return HashBlacklist(self.hashobj.copy(), self.blacklist)
+class HashBlacklistContent(object):
+ """Turn a hashlib-like object into a hash that returns None for some
+ blacklisted content instead of the real hash value. Unlike HashBlacklist,
+ not the output of the hash is considered, but its input."""
+
+ def __init__(self, hashobj, blacklist=(), maxlen=None):
+ """
+ @param hashobj: a hashlib-like object
+ @param blacklist: an object providing __contains__.
+ hash inputs which are contained in the blacklist
+ are turned into None values
+ @param maxlen: the maximum length of a blacklisted input.
+ Defaults to max(map(len, blacklist)), so if it is absent,
+ the blacklist must support iteration.
+ """
+ self.hashobj = hashobj
+ self.blacklist = blacklist
+ if maxlen is None:
+ # the chain avoids passing the empty sequence to max
+ maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist)))
+ self.maxlen = maxlen
+ self.stored = ""
+
+ @property
+ def name(self):
+ return self.hashobj.name
+
+ def update(self, data):
+ if self.stored is not None:
+ self.stored += data
+ if len(self.stored) > self.maxlen:
+ self.stored = None
+ self.hashobj.update(data)
+
+ def digest(self):
+ if self.stored is not None and self.stored in self.blacklist:
+ return None
+ return self.hashobj.digest()
+
+ def hexdigest(self):
+ if self.stored is not None and self.stored in self.blacklist:
+ return None
+ return self.hashobj.hexdigest()
+
+ def copy(self):
+ return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+ self.maxlen)
+
class DecompressedHash(object):
"""Apply a decompression function before the hash. This class provides the
hashlib interface (update, hexdigest, copy) excluding digest and name."""
diff --git a/importpkg.py b/importpkg.py
index 54f6181..cb16f97 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -16,26 +16,21 @@ import yaml
from dedup.arreader import ArReader
from dedup.debpkg import process_control, get_tar_hashes
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
- HashedStream
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+ HashBlacklistContent
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import GIFHash, PNGHash
-boring_sha512_hashes = set((
- # ""
- "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
- # "\n"
- "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
def sha512_nontrivial():
- return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+ return HashBlacklistContent(hashlib.sha512(), boring_content)
def gziphash():
hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
hashobj.name = "gzip_sha512"
- # don't blacklist boring hashes for gzip to get gzip issues right
- return hashobj
+ return HashBlacklistContent(hashobj, boring_content)
def pnghash():
hashobj = PNGHash(hashlib.sha512())