From a8671a9c2ebce7d958fb1cd26a0fab7969d6902b Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Mon, 25 Mar 2013 13:00:48 +0100 Subject: implement variants of the hashes FUZZY_FLAG_ELIMSEQ: The comparison operation runs eliminate_sequence on both hashes before actually comparing them. This step can be moved to hash generation time using this flag. Suggested by Niels Thykier. FUZZY_FLAG_NOTRUNC: The second part of the hash is truncated to SPAMSUM_LENGTH/2 by default. When comparing two hashes with different block sizes, this truncation can result in a larger edit distance and therefore false negatives. This flag disables the truncation. --- fuzzy.h | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) (limited to 'fuzzy.h') diff --git a/fuzzy.h b/fuzzy.h index e08d9f1..1b3da8d 100644 --- a/fuzzy.h +++ b/fuzzy.h @@ -30,6 +30,17 @@ extern "C" { #ifndef FUZZY_H #define FUZZY_H +/** + * @brief fuzzy_digest flag indicating to eliminate sequences of more than + * three identical characters + */ +#define FUZZY_FLAG_ELIMSEQ 0x1u +/** + * @brief fuzzy_digest flag indicating not to truncate the second part to + * SPAMSUM_LENGTH/2 characters. + */ +#define FUZZY_FLAG_NOTRUNC 0x2u + struct fuzzy_state; /** @@ -60,10 +71,12 @@ extern int fuzzy_update(struct fuzzy_state *state, const unsigned char *buffer, * concatenation of the data previously fed using fuzzy_update. * @param result Where the fuzzy hash is stored. This variable * must be allocated to hold at least FUZZY_MAX_RESULT bytes. + * @param flags is a bitwise or of FUZZY_FLAG_* macros. The absence of flags is + * represented by a zero. * @return zero on success, non-zero on error */ extern int fuzzy_digest(const struct fuzzy_state *state, - /*@out@*/ char *result); + /*@out@*/ char *result, unsigned int flags); /** * @brief Dispose a fuzzy state. 
@@ -136,7 +149,7 @@ extern int fuzzy_hash_filename(const char *filename, /*@out@*/ char * result); /** The longest possible length for a fuzzy hash signature * (without the filename) */ -#define FUZZY_MAX_RESULT (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20)) +#define FUZZY_MAX_RESULT (2 * SPAMSUM_LENGTH + 20) #ifdef __cplusplus } -- cgit v1.2.3