2 * Copyright (C) 2002 Andrew Tridgell <tridge@samba.org>
3 * Copyright (C) 2006 ManTech International Corporation
4 * Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 * Earlier versions of this code were named fuzzy.c and can be found at:
21 * http://www.samba.org/ftp/unpacked/junkcode/spamsum/
22 * http://ssdeep.sf.net/
33 #if defined(__GNUC__) && __GNUC__ >= 3
34 #define likely(x) __builtin_expect(!!(x), 1)
35 #define unlikely(x) __builtin_expect(!!(x), 0)
41 #define ROLLING_WINDOW 7
42 #define MIN_BLOCKSIZE 3
43 #define HASH_PRIME 0x01000193
44 #define HASH_INIT 0x28021967
45 #define NUM_BLOCKHASHES 31
48 unsigned char window[ROLLING_WINDOW];
53 static void roll_init(/*@out@*/ struct roll_state *self) {
54 memset(self, 0, sizeof(struct roll_state));
58 * a rolling hash, based on the Adler checksum. By using a rolling hash
59 * we can perform auto resynchronisation after inserts/deletes
61 * internally, h1 is the sum of the bytes in the window and h2
62 * is the sum of the bytes times the index
64 * h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
65 * we can cope with large blocksize values
67 static void roll_hash(struct roll_state *self, unsigned char c) {
69 self->h2 += ROLLING_WINDOW * (uint32_t)c;
71 self->h1 += (uint32_t)c;
72 self->h1 -= (uint32_t)self->window[self->n % ROLLING_WINDOW];
74 self->window[self->n % ROLLING_WINDOW] = c;
77 /* The original spamsum AND'ed this value with 0xFFFFFFFF which
78 * in theory should have no effect. This AND has been removed
79 * for performance (jk) */
84 static uint32_t roll_sum(const struct roll_state *self) {
85 return self->h1 + self->h2 + self->h3;
88 /* A simple non-rolling hash, based on the FNV hash. */
89 static uint32_t sum_hash(unsigned char c, uint32_t h) {
90 return (h * HASH_PRIME) ^ c;
93 /* A blockhash contains a signature state for a specific (implicit) blocksize.
94 * The blocksize is given by SSDEEP_BS(index). The h and halfh members are the
95 * FNV hashes, where halfh stops to be reset after digest is SPAMSUM_LENGTH/2
96 * long. The halfh hash is needed be able to truncate digest for the second
97 * output hash to stay compatible with ssdeep output. */
98 struct blockhash_context {
100 char digest[SPAMSUM_LENGTH];
105 unsigned int bhstart, bhend;
106 struct blockhash_context bh[NUM_BLOCKHASHES];
108 struct roll_state roll;
111 #define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index))
113 /*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_new(void) {
114 struct fuzzy_state *self;
115 if(NULL == (self = malloc(sizeof(struct fuzzy_state))))
116 /* malloc sets ENOMEM */
120 self->bh[0].h = HASH_INIT;
121 self->bh[0].halfh = HASH_INIT;
122 self->bh[0].dlen = 0;
123 self->total_size = 0;
124 roll_init(&self->roll);
128 static void fuzzy_try_fork_blockhash(struct fuzzy_state *self) {
129 struct blockhash_context *obh, *nbh;
130 if(self->bhend >= NUM_BLOCKHASHES)
132 assert(self->bhend > 0);
133 obh = self->bh + (self->bhend - 1);
136 nbh->halfh = obh->halfh;
141 static void fuzzy_try_reduce_blockhash(struct fuzzy_state *self) {
142 assert(self->bhstart < self->bhend);
143 if(self->bhend - self->bhstart < 2)
144 /* Need at least two working hashes. */
146 if((size_t)SSDEEP_BS(self->bhstart) * SPAMSUM_LENGTH >=
148 /* Initial blocksize estimate would select this or a smaller
151 if(self->bh[self->bhstart + 1].dlen < SPAMSUM_LENGTH / 2)
152 /* Estimate adjustment would select this blocksize. */
154 /* At this point we are clearly no longer interested in the
155 * start_blocksize. Get rid of it. */
159 static const char *b64 =
160 "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
162 static void fuzzy_engine_step(struct fuzzy_state *self, unsigned char c) {
165 /* At each character we update the rolling hash and the normal hashes.
166 * When the rolling hash hits a reset value then we emit a normal hash
167 * as a element of the signature and reset the normal hash. */
168 roll_hash(&self->roll, c);
169 h = roll_sum(&self->roll);
171 for(i = self->bhstart; i < self->bhend; ++i) {
172 self->bh[i].h = sum_hash(c, self->bh[i].h);
173 self->bh[i].halfh = sum_hash(c, self->bh[i].halfh);
176 for(i = self->bhstart; i < self->bhend; ++i) {
177 /* With growing blocksize almost no runs fail the next test. */
178 if(likely(h % SSDEEP_BS(i) != SSDEEP_BS(i) - 1))
179 /* Once this condition is false for one bs, it is
180 * automatically false for all further bs. I.e. if
181 * h === -1 (mod 2*bs) then h === -1 (mod bs). */
183 /* We have hit a reset point. We now emit hashes which are
184 * based on all characters in the piece of the message between
185 * the last reset point and this one */
186 if(unlikely(0 == self->bh[i].dlen)) {
187 /* Can only happen 30 times. */
188 /* First step for this blocksize. Clone next. */
189 fuzzy_try_fork_blockhash(self);
191 if(self->bh[i].dlen < SPAMSUM_LENGTH - 1) {
192 /* We can have a problem with the tail overflowing. The
193 * easiest way to cope with this is to only reset the
194 * normal hash if we have room for more characters in
195 * our signature. This has the effect of combining the
196 * last few pieces of the message into a single piece
198 self->bh[i].digest[self->bh[i].dlen++] =
199 b64[self->bh[i].h % 64];
200 self->bh[i].h = HASH_INIT;
201 if(self->bh[i].dlen < SPAMSUM_LENGTH / 2)
202 self->bh[i].halfh = HASH_INIT;
204 fuzzy_try_reduce_blockhash(self);
208 int fuzzy_update(struct fuzzy_state *self, const unsigned char *buffer,
209 size_t buffer_size) {
210 self->total_size += buffer_size;
211 for( ;buffer_size > 0; ++buffer, --buffer_size)
212 fuzzy_engine_step(self, *buffer);
216 int fuzzy_digest(const struct fuzzy_state *self, /*@out@*/ char *result) {
217 unsigned int bi = self->bhstart;
218 uint32_t h = roll_sum(&self->roll);
219 int i, remain = FUZZY_MAX_RESULT - 1;
220 /* Verify that our elimination was not overeager. */
221 assert(bi == 0 || (size_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH <
224 /* Initial blocksize guess. */
225 while((size_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < self->total_size) {
227 if(bi >= NUM_BLOCKHASHES) {
228 /* The input exceeds data types. */
233 /* Adapt blocksize guess to actual digest length. */
234 while(bi >= self->bhend)
236 while(bi > self->bhstart && self->bh[bi].dlen < SPAMSUM_LENGTH / 2)
238 assert(!(bi > 0 && self->bh[bi].dlen < SPAMSUM_LENGTH / 2));
240 i = snprintf(result, (size_t)remain, "%u:", SSDEEP_BS(bi));
242 /* Maybe snprintf has set errno here? */
247 i = (int)self->bh[bi].dlen;
250 memcpy(result, self->bh[bi].digest, (size_t)i);
253 if(remain > 0 && h != 0) {
254 *result++ = b64[self->bh[bi].h % 64];
261 if(bi < self->bhend - 1) {
263 i = (int)self->bh[bi].dlen;
264 if(i > SPAMSUM_LENGTH / 2 - 1)
265 i = SPAMSUM_LENGTH / 2 - 1;
268 memcpy(result, self->bh[bi].digest, (size_t)i);
271 if(remain > 0 && h != 0) {
272 *result++ = b64[self->bh[bi].halfh % 64];
275 } else if(remain > 0 && h != 0) {
276 assert(self->bh[bi].dlen == 0);
277 *result++ = b64[self->bh[bi].h % 64];
284 void fuzzy_free(/*@only@*/ struct fuzzy_state *self) {
288 int fuzzy_hash_buf(const unsigned char *buf, uint32_t buf_len,
289 /*@out@*/ char *result) {
290 struct fuzzy_state *ctx;
292 if(NULL == (ctx = fuzzy_new()))
294 if(fuzzy_update(ctx, buf, buf_len) < 0)
296 if(fuzzy_digest(ctx, result) < 0)
304 int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result) {
305 struct fuzzy_state *ctx;
306 unsigned char buffer[4096];
309 if(NULL == (ctx = fuzzy_new()))
312 n = fread(buffer, 1, 4096, handle);
315 if(fuzzy_update(ctx, buffer, n) < 0)
318 if(ferror(handle) != 0)
320 if(fuzzy_digest(ctx, result) < 0)
329 typedef size_t off_t;
330 int fseeko(FILE *, off_t, int);
331 off_t ftello(FILE *);
334 int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result) {
337 fpos = ftello(handle);
338 if(fseek(handle, 0, SEEK_SET) < 0)
340 status = fuzzy_hash_stream(handle, result);
342 if(fseeko(handle, fpos, SEEK_SET) < 0)
347 int fuzzy_hash_filename(const char *filename, /*@out@*/ char *result) {
349 FILE *handle = fopen(filename, "rb");
352 status = fuzzy_hash_stream(handle, result);
353 /* We cannot do anything about an fclose failure. */
354 (void)fclose(handle);