diff options
author | Helmut Grohne <helmut@subdivi.de> | 2014-02-21 22:06:24 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2014-02-21 22:06:24 +0100 |
commit | b602935b6a963dfed262b8e0f71f8593cf2fae16 (patch) | |
tree | c67edab3e63078ae92f5d13e4b5469c7e452903a | |
parent | f07c4642083e19c90a180b7bc407d54f341a1909 (diff) | |
download | debian-dedup-morehashes.tar.gz |
add a "hash" function computing the ratio of printable characters [morehashes]
-rw-r--r-- | dedup/hashing.py | 26 |
1 files changed, 26 insertions, 0 deletions
# Addition to dedup/hashing.py (placed after the HashedStream class).
import string  # needed at module level for RateBinary.printable


class RateBinary(object):
    """Compute the ratio of printable characters to the total number of
    characters. Not a hash really.

    The object mimics the small hashlib-like interface (``name``,
    ``update``, ``hexdigest``, ``copy``) so that it can be used wherever
    such an object is expected.
    """
    # Characters regarded as printable (one-character strings).
    printable = set(string.printable)
    # Same characters plus their integer code points. Iterating over a
    # byte string yields one-character strings on Python 2 but integers on
    # Python 3; accepting both forms makes update() work on either.
    _accepted = printable.union(map(ord, string.printable))
    name = "ratebinary"

    def __init__(self):
        # Total number of characters fed in so far.
        self.length = 0
        # Number of those characters that were printable.
        self.good = 0

    def update(self, data):
        """Account for another chunk of input.

        @param data: a text or byte string; each element is counted once
            and classified as printable or not
        """
        self.length += len(data)
        accepted = self._accepted
        self.good += sum(1 for c in data if c in accepted)

    def hexdigest(self):
        """Return the printable ratio formatted with three decimals.

        @returns: a string such as "0.750", or "NaN" if no data has been
            fed in yet (the ratio is undefined for empty input)
        """
        if not self.length:
            return "NaN"
        return "%0.3f" % (float(self.good) / float(self.length))

    def copy(self):
        """Return an independent RateBinary carrying the same counters.

        @returns: a new object; updating it does not affect self
        """
        new = RateBinary()
        new.length = self.length
        new.good = self.good
        # The counters were copied but the object was never handed back,
        # so callers previously received None.
        return new