summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de> 2014-02-21 22:06:24 +0100
committer: Helmut Grohne <helmut@subdivi.de> 2014-02-21 22:06:24 +0100
commit: b602935b6a963dfed262b8e0f71f8593cf2fae16 (patch)
tree: c67edab3e63078ae92f5d13e4b5469c7e452903a
parent: f07c4642083e19c90a180b7bc407d54f341a1909 (diff)
download: debian-dedup-b602935b6a963dfed262b8e0f71f8593cf2fae16.tar.gz
add a "hash" function computing the ratio of printable characters [ref: morehashes]
-rw-r--r-- dedup/hashing.py 26
1 files changed, 26 insertions, 0 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 70f6268..1061c00 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,4 +1,5 @@
import itertools
+import string
class HashBlacklist(object):
"""Turn a hashlib-like object into a hash that returns None for some
@@ -179,3 +180,28 @@ class HashedStream(object):
# Return the hexadecimal digest reported by the wrapped hash object.
def hexdigest(self):
return self.hashobj.hexdigest()
+
class RateBinary(object):
    """Compute the ratio of printable characters to the total number of
    characters. Not a hash really.

    The class exposes ``name``, ``update``, ``hexdigest`` and ``copy`` so
    that it can be driven the same way as a hash object, but the "digest"
    is a decimal fraction rendered as text.
    """
    # Characters that count as printable when the data iterates as
    # one-character strings.
    printable = set(string.printable)
    # The same characters as integer code points, for buffers that iterate
    # as integers (e.g. ``bytes`` on Python 3 or ``bytearray``).
    _printable_codes = frozenset(map(ord, string.printable))
    name = "ratebinary"

    def __init__(self):
        # Total number of characters seen so far.
        self.length = 0
        # Number of those characters that were printable.
        self.good = 0

    def update(self, data):
        """Account for another chunk of data.

        data -- a string or byte buffer; each element is checked against
                the printable set, either as a character or as a code point.
        """
        self.length += len(data)
        printable = self.printable
        codes = self._printable_codes
        # Count without materialising an intermediate list.
        self.good += sum(1 for c in data if c in printable or c in codes)

    def hexdigest(self):
        """Return the printable ratio formatted with three decimals, or
        "NaN" if no data has been fed in yet."""
        if not self.length:
            return "NaN"
        return "%0.3f" % (float(self.good) / float(self.length))

    def copy(self):
        """Return an independent RateBinary carrying the same counters."""
        new = RateBinary()
        new.length = self.length
        new.good = self.good
        # The clone must be handed back to the caller; without this return
        # the method yields None.
        return new