diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-03-25 13:00:48 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-03-25 13:00:48 +0100 |
commit | a8671a9c2ebce7d958fb1cd26a0fab7969d6902b (patch) | |
tree | 970128189ac9073f4769f50cb24c795964434b0a /pyfuzzy.pyx | |
parent | b7dad638d2eaa4d02ac8fbbdefa540a9473d6f80 (diff) | |
download | ssdeep-a8671a9c2ebce7d958fb1cd26a0fab7969d6902b.tar.gz |
implement variants of the hashes
FUZZY_FLAG_ELIMSEQ: The comparison operation runs eliminate_sequence
on both hashes before actually comparing them. With this flag, that
step can be moved to hash generation time. Suggested by Niels
Thykier.
FUZZY_FLAG_NOTRUNC: The second part of the hash is truncated to
SPAMSUM_LENGTH/2 by default. When comparing two hashes with
different block sizes, this can result in a larger edit distance and
therefore false negatives. This flag disables the truncation.
Diffstat (limited to 'pyfuzzy.pyx')
-rw-r--r-- | pyfuzzy.pyx | 11 |
1 files changed, 8 insertions, 3 deletions
diff --git a/pyfuzzy.pyx b/pyfuzzy.pyx
index 2a18786..e9d870e 100644
--- a/pyfuzzy.pyx
+++ b/pyfuzzy.pyx
@@ -21,10 +21,13 @@ import os
 cdef extern from "fuzzy.h":
     struct fuzzy_state
     cdef enum:
+        FUZZY_FLAG_ELIMSEQ
+        FUZZY_FLAG_NOTRUNC
         FUZZY_MAX_RESULT
     cdef extern fuzzy_state *fuzzy_new() nogil
     cdef extern int fuzzy_update(fuzzy_state *, unsigned char *, size_t) nogil
-    cdef extern int fuzzy_digest(fuzzy_state *, char *) nogil
+    cdef extern int fuzzy_digest(fuzzy_state *, char *,
+                                 unsigned int flags) nogil
     cdef extern void fuzzy_free(fuzzy_state *) nogil
 
 class FuzzyError(Exception):
@@ -56,11 +59,13 @@ cdef class FuzzyHash:
             self.state = NULL
             raise FuzzyError(libc.errno.errno)
 
-    def digest(self):
+    def digest(self, elimseq=False, notrunc=False):
         if self.state == NULL:
             raise FuzzyError(libc.errno.EINVAL)
         cdef char result[FUZZY_MAX_RESULT]
-        if fuzzy_digest(self.state, result) != 0:
+        flags = (FUZZY_FLAG_ELIMSEQ if elimseq else 0) | \
+                (FUZZY_FLAG_NOTRUNC if notrunc else 0)
+        if fuzzy_digest(self.state, result, flags) != 0:
             raise FuzzyError(libc.errno.errno)
         return str(result)
 