# Origin: commit b602935b6a963dfed262b8e0f71f8593cf2fae16 by Helmut Grohne,
# Fri, 21 Feb 2014 22:06:24 +0100, against dedup/hashing.py:
# add a "hash" function computing the ratio of printable characters
import string


class RateBinary(object):
    """Compute the ratio of printable characters to the total number of
    characters. Not a hash really.

    The object exposes ``update``, ``hexdigest`` and ``copy`` methods in the
    style of a hashlib object, so it can be used where such an object is
    expected. "Printable" means membership in ``string.printable``.
    """
    # One-character strings that count as printable (used for text input).
    printable = set(string.printable)
    # The same characters as integer byte values. Iterating bytes on
    # Python 3 yields ints, which would never match the string set above.
    _printable_bytes = frozenset(bytearray(string.printable.encode("ascii")))
    name = "ratebinary"

    def __init__(self):
        # Total number of characters/bytes seen so far.
        self.length = 0
        # How many of those were printable.
        self.good = 0

    def update(self, data):
        """Account for another chunk of input.

        @param data: a text string or a bytes-like object
        """
        self.length += len(data)
        if isinstance(data, type(u"")):
            # Text: compare character by character.
            printable = self.printable
            self.good += sum(1 for c in data if c in printable)
        else:
            # Bytes-like (including Python 2 str): compare by byte value.
            # bytearray() yields ints on both Python 2 and Python 3.
            printable = self._printable_bytes
            self.good += sum(1 for b in bytearray(data) if b in printable)

    def hexdigest(self):
        """Return the printable ratio formatted with three decimals, or the
        string "NaN" when no data has been fed yet."""
        if not self.length:
            return "NaN"
        return "%0.3f" % (float(self.good) / float(self.length))

    def copy(self):
        """Return an independent RateBinary carrying the same counters."""
        new = RateBinary()
        new.length = self.length
        new.good = self.good
        # The original forgot this return, so copy() always yielded None.
        return new