From a8671a9c2ebce7d958fb1cd26a0fab7969d6902b Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Mon, 25 Mar 2013 13:00:48 +0100 Subject: implement variants of the hashes FUZZY_FLAG_ELIMSEQ: The comparison operation runs eliminate_sequence on both hashes before actually comparing them. This step can be moved to hash generation time using this flag. Suggested by Niels Thykier. FUZZY_FLAG_NOTRUNC: The second part of the hash is truncated to SPAMSUM_LENGTH/2 by default. When comparing two hashes with different block sizes, this truncation can result in a larger edit distance and therefore false negatives. --- pyfuzzy.pyx | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'pyfuzzy.pyx') diff --git a/pyfuzzy.pyx b/pyfuzzy.pyx index 2a18786..e9d870e 100644 --- a/pyfuzzy.pyx +++ b/pyfuzzy.pyx @@ -21,10 +21,13 @@ import os cdef extern from "fuzzy.h": struct fuzzy_state cdef enum: + FUZZY_FLAG_ELIMSEQ + FUZZY_FLAG_NOTRUNC FUZZY_MAX_RESULT cdef extern fuzzy_state *fuzzy_new() nogil cdef extern int fuzzy_update(fuzzy_state *, unsigned char *, size_t) nogil - cdef extern int fuzzy_digest(fuzzy_state *, char *) nogil + cdef extern int fuzzy_digest(fuzzy_state *, char *, + unsigned int flags) nogil cdef extern void fuzzy_free(fuzzy_state *) nogil class FuzzyError(Exception): @@ -56,11 +59,13 @@ cdef class FuzzyHash: self.state = NULL raise FuzzyError(libc.errno.errno) - def digest(self): + def digest(self, elimseq=False, notrunc=False): if self.state == NULL: raise FuzzyError(libc.errno.EINVAL) cdef char result[FUZZY_MAX_RESULT] - if fuzzy_digest(self.state, result) != 0: + flags = (FUZZY_FLAG_ELIMSEQ if elimseq else 0) | \ + (FUZZY_FLAG_NOTRUNC if notrunc else 0) + if fuzzy_digest(self.state, result, flags) != 0: raise FuzzyError(libc.errno.errno) return str(result) -- cgit v1.2.3