diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-03-25 13:00:48 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-03-25 13:00:48 +0100 |
commit | a8671a9c2ebce7d958fb1cd26a0fab7969d6902b (patch) | |
tree | 970128189ac9073f4769f50cb24c795964434b0a | |
parent | b7dad638d2eaa4d02ac8fbbdefa540a9473d6f80 (diff) | |
download | ssdeep-a8671a9c2ebce7d958fb1cd26a0fab7969d6902b.tar.gz |
implement variants of the hashes
FUZZY_FLAG_ELIMSEQ: The comparison operation runs eliminate_sequence
before actually comparing two hashes on both of them. This step can
be moved to hash generation time using this flag. Suggested by Niels
Thykier.
FUZZY_FLAG_NOTRUNC: The second part of the hash is truncated to
SPAMSUM_LENGTH/2 by default. When comparing two hashes with
different blocksize this can result in a larger edit distance and
therefore false negatives.
-rw-r--r-- | fuzzy.c | 85 | ||||
-rw-r--r-- | fuzzy.h | 17 | ||||
-rw-r--r-- | pyfuzzy.pyx | 11 |
3 files changed, 86 insertions, 27 deletions
@@ -213,14 +213,31 @@ int fuzzy_update(struct fuzzy_state *self, const unsigned char *buffer, return 0; } +static int memcpy_eliminate_sequences(char *dst, const char *src, + int n) { + const char *srcend = src + n; + assert(n >= 0); + if(src < srcend) *dst++ = *src++; + if(src < srcend) *dst++ = *src++; + if(src < srcend) *dst++ = *src++; + while(src < srcend) + if(*src == dst[-1] && *src == dst[-2] && *src == dst[-3]) { + ++src; + --n; + } else + *dst++ = *src++; + return n; +} + #ifdef S_SPLINT_S extern const int EOVERFLOW; #endif -int fuzzy_digest(const struct fuzzy_state *self, /*@out@*/ char *result) { +int fuzzy_digest(const struct fuzzy_state *self, /*@out@*/ char *result, + unsigned int flags) { unsigned int bi = self->bhstart; uint32_t h = roll_sum(&self->roll); - int i, remain = FUZZY_MAX_RESULT - 1; + int i, remain = FUZZY_MAX_RESULT - 1; /* Exclude terminating '\0'. */ /* Verify that our elimination was not overeager. */ assert(bi == 0 || (size_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH < self->total_size); @@ -249,36 +266,60 @@ int fuzzy_digest(const struct fuzzy_state *self, /*@out@*/ char *result) { remain -= i; result += i; i = (int)self->bh[bi].dlen; - if(i > remain) - i = remain; - memcpy(result, self->bh[bi].digest, (size_t)i); + assert(i <= remain); + if((flags & FUZZY_FLAG_ELIMSEQ) != 0) + i = memcpy_eliminate_sequences(result, self->bh[bi].digest, i); + else + memcpy(result, self->bh[bi].digest, (size_t)i); result += i; remain -= i; - if(remain > 0 && h != 0) { - *result++ = b64[self->bh[bi].h % 64]; - --remain; - } - if(remain > 0) { - *result++ = ':'; - --remain; + if(h != 0) { + assert(remain > 0); + *result = b64[self->bh[bi].h % 64]; + if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 || + *result != result[-1] || + *result != result[-2] || + *result != result[-3]) { + ++result; + --remain; + } } + assert(remain > 0); + *result++ = ':'; + --remain; if(bi < self->bhend - 1) { ++bi; i = (int)self->bh[bi].dlen; - if(i > SPAMSUM_LENGTH / 2 - 1) + if((flags & FUZZY_FLAG_NOTRUNC) == 0 && + i > SPAMSUM_LENGTH / 2 - 1) i = SPAMSUM_LENGTH / 2 - 1; - if(i > remain) - i = remain; - memcpy(result, self->bh[bi].digest, (size_t)i); + assert(i <= remain); + if((flags & FUZZY_FLAG_ELIMSEQ) != 0) + i = memcpy_eliminate_sequences(result, + self->bh[bi].digest, i); + else + memcpy(result, self->bh[bi].digest, (size_t)i); result += i; remain -= i; - if(remain > 0 && h != 0) { - *result++ = b64[self->bh[bi].halfh % 64]; - --remain; + if(h != 0) { + assert(remain > 0); + h = (flags & FUZZY_FLAG_NOTRUNC) != 0 ? self->bh[bi].h : + self->bh[bi].halfh; + *result = b64[h % 64]; + if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 || + *result != result[-1] || + *result != result[-2] || + *result != result[-3]) { + ++result; + --remain; + } } - } else if(remain > 0 && h != 0) { + } else if(h != 0) { assert(self->bh[bi].dlen == 0); + assert(remain > 0); *result++ = b64[self->bh[bi].h % 64]; + /* No need to bother with FUZZY_FLAG_ELIMSEQ, because this + * digest has length 1. */ --remain; } *result = '\0'; @@ -297,7 +338,7 @@ int fuzzy_hash_buf(const unsigned char *buf, uint32_t buf_len, return -1; if(fuzzy_update(ctx, buf, buf_len) < 0) goto out; - if(fuzzy_digest(ctx, result) < 0) + if(fuzzy_digest(ctx, result, 0) < 0) goto out; ret = 0; out: @@ -321,7 +362,7 @@ int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result) { } if(ferror(handle) != 0) goto out; - if(fuzzy_digest(ctx, result) < 0) + if(fuzzy_digest(ctx, result, 0) < 0) goto out; ret = 0; out: @@ -30,6 +30,17 @@ extern "C" { #ifndef FUZZY_H #define FUZZY_H +/** + * @brief fuzzy_digest flag indicating to eliminate sequences of more than + * three identical characters + */ +#define FUZZY_FLAG_ELIMSEQ 0x1u +/** + * @brief fuzzy_digest flag indicating not to truncate the second part to + * SPAMSUM_LENGTH/2 characters. + */ +#define FUZZY_FLAG_NOTRUNC 0x2u + struct fuzzy_state; /** @@ -60,10 +71,12 @@ extern int fuzzy_update(struct fuzzy_state *state, const unsigned char *buffer, * concatenation of the data previously fed using fuzzy_update. * @param result Where the fuzzy hash is stored. This variable * must be allocated to hold at least FUZZY_MAX_RESULT bytes. + * @param flags is a bitwise or of FUZZY_FLAG_* macros. The absence of flags is + * represented by a zero. * @return zero on success, non-zero on error */ extern int fuzzy_digest(const struct fuzzy_state *state, - /*@out@*/ char *result); + /*@out@*/ char *result, unsigned int flags); /** * @brief Dispose a fuzzy state. @@ -136,7 +149,7 @@ extern int fuzzy_hash_filename(const char *filename, /*@out@*/ char * result); /** The longest possible length for a fuzzy hash signature * (without the filename) */ -#define FUZZY_MAX_RESULT (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20)) +#define FUZZY_MAX_RESULT (2 * SPAMSUM_LENGTH + 20) #ifdef __cplusplus } diff --git a/pyfuzzy.pyx b/pyfuzzy.pyx index 2a18786..e9d870e 100644 --- a/pyfuzzy.pyx +++ b/pyfuzzy.pyx @@ -21,10 +21,13 @@ import os cdef extern from "fuzzy.h": struct fuzzy_state cdef enum: + FUZZY_FLAG_ELIMSEQ + FUZZY_FLAG_NOTRUNC FUZZY_MAX_RESULT cdef extern fuzzy_state *fuzzy_new() nogil cdef extern int fuzzy_update(fuzzy_state *, unsigned char *, size_t) nogil - cdef extern int fuzzy_digest(fuzzy_state *, char *) nogil + cdef extern int fuzzy_digest(fuzzy_state *, char *, + unsigned int flags) nogil cdef extern void fuzzy_free(fuzzy_state *) nogil class FuzzyError(Exception): @@ -56,11 +59,13 @@ cdef class FuzzyHash: self.state = NULL raise FuzzyError(libc.errno.errno) - def digest(self): + def digest(self, elimseq=False, notrunc=False): if self.state == NULL: raise FuzzyError(libc.errno.EINVAL) cdef char result[FUZZY_MAX_RESULT] - if fuzzy_digest(self.state, result) != 0: + flags = (FUZZY_FLAG_ELIMSEQ if elimseq else 0) | \ + (FUZZY_FLAG_NOTRUNC if notrunc else 0) + if fuzzy_digest(self.state, result, flags) != 0: raise FuzzyError(libc.errno.errno) return str(result) |