summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2013-03-25 13:00:48 +0100
committerHelmut Grohne <helmut@subdivi.de>2013-03-25 13:00:48 +0100
commita8671a9c2ebce7d958fb1cd26a0fab7969d6902b (patch)
tree970128189ac9073f4769f50cb24c795964434b0a
parentb7dad638d2eaa4d02ac8fbbdefa540a9473d6f80 (diff)
downloadssdeep-a8671a9c2ebce7d958fb1cd26a0fab7969d6902b.tar.gz
implement variants of the hashes
FUZZY_FLAG_ELIMSEQ: The comparison operation runs eliminate_sequence before actually comparing two hashes on both of them. This step can be moved to hash generation time using this flag. Suggested by Niels Thykier. FUZZY_FLAG_NOTRUNC: The second part of the hash is truncated to SPAMSUM_LENGTH/2 by default. When comparing two hashes with different blocksize this can result in a larger edit distance and therefore false negatives.
-rw-r--r--fuzzy.c85
-rw-r--r--fuzzy.h17
-rw-r--r--pyfuzzy.pyx11
3 files changed, 86 insertions, 27 deletions
diff --git a/fuzzy.c b/fuzzy.c
index 6123341..f1dcdcf 100644
--- a/fuzzy.c
+++ b/fuzzy.c
@@ -213,14 +213,31 @@ int fuzzy_update(struct fuzzy_state *self, const unsigned char *buffer,
return 0;
}
+static int memcpy_eliminate_sequences(char *dst, const char *src,
+ int n) {
+ const char *srcend = src + n;
+ assert(n >= 0);
+ if(src < srcend) *dst++ = *src++;
+ if(src < srcend) *dst++ = *src++;
+ if(src < srcend) *dst++ = *src++;
+ while(src < srcend)
+ if(*src == dst[-1] && *src == dst[-2] && *src == dst[-3]) {
+ ++src;
+ --n;
+ } else
+ *dst++ = *src++;
+ return n;
+}
+
#ifdef S_SPLINT_S
extern const int EOVERFLOW;
#endif
-int fuzzy_digest(const struct fuzzy_state *self, /*@out@*/ char *result) {
+int fuzzy_digest(const struct fuzzy_state *self, /*@out@*/ char *result,
+ unsigned int flags) {
unsigned int bi = self->bhstart;
uint32_t h = roll_sum(&self->roll);
- int i, remain = FUZZY_MAX_RESULT - 1;
+ int i, remain = FUZZY_MAX_RESULT - 1; /* Exclude terminating '\0'. */
/* Verify that our elimination was not overeager. */
assert(bi == 0 || (size_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH <
self->total_size);
@@ -249,36 +266,60 @@ int fuzzy_digest(const struct fuzzy_state *self, /*@out@*/ char *result) {
remain -= i;
result += i;
i = (int)self->bh[bi].dlen;
- if(i > remain)
- i = remain;
- memcpy(result, self->bh[bi].digest, (size_t)i);
+ assert(i <= remain);
+ if((flags & FUZZY_FLAG_ELIMSEQ) != 0)
+ i = memcpy_eliminate_sequences(result, self->bh[bi].digest, i);
+ else
+ memcpy(result, self->bh[bi].digest, (size_t)i);
result += i;
remain -= i;
- if(remain > 0 && h != 0) {
- *result++ = b64[self->bh[bi].h % 64];
- --remain;
- }
- if(remain > 0) {
- *result++ = ':';
- --remain;
+ if(h != 0) {
+ assert(remain > 0);
+ *result = b64[self->bh[bi].h % 64];
+ if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
+ *result != result[-1] ||
+ *result != result[-2] ||
+ *result != result[-3]) {
+ ++result;
+ --remain;
+ }
}
+ assert(remain > 0);
+ *result++ = ':';
+ --remain;
if(bi < self->bhend - 1) {
++bi;
i = (int)self->bh[bi].dlen;
- if(i > SPAMSUM_LENGTH / 2 - 1)
+ if((flags & FUZZY_FLAG_NOTRUNC) == 0 &&
+ i > SPAMSUM_LENGTH / 2 - 1)
i = SPAMSUM_LENGTH / 2 - 1;
- if(i > remain)
- i = remain;
- memcpy(result, self->bh[bi].digest, (size_t)i);
+ assert(i <= remain);
+ if((flags & FUZZY_FLAG_ELIMSEQ) != 0)
+ i = memcpy_eliminate_sequences(result,
+ self->bh[bi].digest, i);
+ else
+ memcpy(result, self->bh[bi].digest, (size_t)i);
result += i;
remain -= i;
- if(remain > 0 && h != 0) {
- *result++ = b64[self->bh[bi].halfh % 64];
- --remain;
+ if(h != 0) {
+ assert(remain > 0);
+ h = (flags & FUZZY_FLAG_NOTRUNC) != 0 ? self->bh[bi].h :
+ self->bh[bi].halfh;
+ *result = b64[h % 64];
+ if((flags & FUZZY_FLAG_ELIMSEQ) == 0 || i < 3 ||
+ *result != result[-1] ||
+ *result != result[-2] ||
+ *result != result[-3]) {
+ ++result;
+ --remain;
+ }
}
- } else if(remain > 0 && h != 0) {
+ } else if(h != 0) {
assert(self->bh[bi].dlen == 0);
+ assert(remain > 0);
*result++ = b64[self->bh[bi].h % 64];
+ /* No need to bother with FUZZY_FLAG_ELIMSEQ, because this
+ * digest has length 1. */
--remain;
}
*result = '\0';
@@ -297,7 +338,7 @@ int fuzzy_hash_buf(const unsigned char *buf, uint32_t buf_len,
return -1;
if(fuzzy_update(ctx, buf, buf_len) < 0)
goto out;
- if(fuzzy_digest(ctx, result) < 0)
+ if(fuzzy_digest(ctx, result, 0) < 0)
goto out;
ret = 0;
out:
@@ -321,7 +362,7 @@ int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result) {
}
if(ferror(handle) != 0)
goto out;
- if(fuzzy_digest(ctx, result) < 0)
+ if(fuzzy_digest(ctx, result, 0) < 0)
goto out;
ret = 0;
out:
diff --git a/fuzzy.h b/fuzzy.h
index e08d9f1..1b3da8d 100644
--- a/fuzzy.h
+++ b/fuzzy.h
@@ -30,6 +30,17 @@ extern "C" {
#ifndef FUZZY_H
#define FUZZY_H
+/**
+ * @brief fuzzy_digest flag indicating to eliminate sequences of more than
+ * three identical characters
+ */
+#define FUZZY_FLAG_ELIMSEQ 0x1u
+/**
+ * @brief fuzzy_digest flag indicating not to truncate the second part to
+ * SPAMSUM_LENGTH/2 characters.
+ */
+#define FUZZY_FLAG_NOTRUNC 0x2u
+
struct fuzzy_state;
/**
@@ -60,10 +71,12 @@ extern int fuzzy_update(struct fuzzy_state *state, const unsigned char *buffer,
* concatenation of the data previously fed using fuzzy_update.
* @param result Where the fuzzy hash is stored. This variable
* must be allocated to hold at least FUZZY_MAX_RESULT bytes.
+ * @param flags is a bitwise or of FUZZY_FLAG_* macros. The absence of flags is
+ * represented by a zero.
* @return zero on success, non-zero on error
*/
extern int fuzzy_digest(const struct fuzzy_state *state,
- /*@out@*/ char *result);
+ /*@out@*/ char *result, unsigned int flags);
/**
* @brief Dispose a fuzzy state.
@@ -136,7 +149,7 @@ extern int fuzzy_hash_filename(const char *filename, /*@out@*/ char * result);
/** The longest possible length for a fuzzy hash signature
* (without the filename) */
-#define FUZZY_MAX_RESULT (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20))
+#define FUZZY_MAX_RESULT (2 * SPAMSUM_LENGTH + 20)
#ifdef __cplusplus
}
diff --git a/pyfuzzy.pyx b/pyfuzzy.pyx
index 2a18786..e9d870e 100644
--- a/pyfuzzy.pyx
+++ b/pyfuzzy.pyx
@@ -21,10 +21,13 @@ import os
cdef extern from "fuzzy.h":
struct fuzzy_state
cdef enum:
+ FUZZY_FLAG_ELIMSEQ
+ FUZZY_FLAG_NOTRUNC
FUZZY_MAX_RESULT
cdef extern fuzzy_state *fuzzy_new() nogil
cdef extern int fuzzy_update(fuzzy_state *, unsigned char *, size_t) nogil
- cdef extern int fuzzy_digest(fuzzy_state *, char *) nogil
+ cdef extern int fuzzy_digest(fuzzy_state *, char *,
+ unsigned int flags) nogil
cdef extern void fuzzy_free(fuzzy_state *) nogil
class FuzzyError(Exception):
@@ -56,11 +59,13 @@ cdef class FuzzyHash:
self.state = NULL
raise FuzzyError(libc.errno.errno)
- def digest(self):
+ def digest(self, elimseq=False, notrunc=False):
if self.state == NULL:
raise FuzzyError(libc.errno.EINVAL)
cdef char result[FUZZY_MAX_RESULT]
- if fuzzy_digest(self.state, result) != 0:
+ flags = (FUZZY_FLAG_ELIMSEQ if elimseq else 0) | \
+ (FUZZY_FLAG_NOTRUNC if notrunc else 0)
+ if fuzzy_digest(self.state, result, flags) != 0:
raise FuzzyError(libc.errno.errno)
return str(result)