summaryrefslogtreecommitdiff
path: root/dedup/hashing.py
diff options
context:
space:
mode:
Diffstat (limited to 'dedup/hashing.py')
-rw-r--r--dedup/hashing.py26
1 files changed, 26 insertions, 0 deletions
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 70f6268..1061c00 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,4 +1,5 @@
import itertools
+import string
class HashBlacklist(object):
"""Turn a hashlib-like object into a hash that returns None for some
@@ -179,3 +180,28 @@ class HashedStream(object):
def hexdigest(self):
    """Return the hex digest string produced by the wrapped hash object."""
    digest = self.hashobj.hexdigest()
    return digest
+
class RateBinary(object):
    """Compute the ratio of printable characters to the total number of
    characters. Not a hash really.

    The class exposes the same method names as a hashlib object (update,
    hexdigest, copy) plus a name attribute, so it can be fed data
    incrementally in the same way.
    """
    # Printable characters as one-character strings (matches text input).
    printable = set(string.printable)
    # The same characters as integer code points: iterating over bytes-like
    # data on Python 3 yields ints, which never compare equal to strings.
    _printable_codes = frozenset(map(ord, string.printable))
    name = "ratebinary"

    def __init__(self):
        # Total number of items seen so far and how many were printable.
        self.length = 0
        self.good = 0

    def update(self, data):
        """Account for another chunk of data.

        @param data: a str or bytes-like chunk; each element is classified
            as printable or not according to string.printable
        """
        self.length += len(data)
        printable = self.printable
        codes = self._printable_codes
        # Generator instead of a throwaway list; the second membership test
        # covers the integer items produced by bytes on Python 3.
        self.good += sum(1 for item in data
                         if item in printable or item in codes)

    def hexdigest(self):
        """Return the printable ratio formatted with three decimals, or the
        string "NaN" if no data has been supplied yet."""
        if not self.length:
            return "NaN"
        return "%0.3f" % (float(self.good) / float(self.length))

    def copy(self):
        """Return an independent RateBinary carrying the same counters."""
        new = RateBinary()
        new.length = self.length
        new.good = self.good
        # The original forgot this return and thus handed None to callers.
        return new