/* ssdeep
 * Copyright (C) 2002 Andrew Tridgell
 * Copyright (C) 2006 ManTech International Corporation
 * Copyright (C) 2013 Helmut Grohne
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Earlier versions of this code were named fuzzy.c and can be found at:
 *     http://www.samba.org/ftp/unpacked/junkcode/spamsum/
 *     http://ssdeep.sf.net/
 */

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if defined(__GNUC__) && __GNUC__ >= 3
#define likely(x) __builtin_expect(!!(x), 1)
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define likely(x) x
#define unlikely(x) x
#endif

#define SPAMSUM_LENGTH 64
#define FUZZY_MAX_RESULT (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20))

#define ROLLING_WINDOW 7
#define MIN_BLOCKSIZE 3
#define HASH_PRIME 0x01000193
#define HASH_INIT 0x28021967

struct roll_state {
	unsigned char window[ROLLING_WINDOW];
	uint32_t h1, h2, h3;
	uint32_t n;
};

static void roll_init(/*@out@*/ struct roll_state *self)
{
	memset(self, 0, sizeof(struct roll_state));
}

/*
 * a rolling hash, based on the Adler checksum. By using a rolling hash
 * we can perform auto resynchronisation after inserts/deletes
 *
 * internally, h1 is the sum of the bytes in the window and h2
 * is the sum of the bytes times the index
 *
 * h3 is a shift/xor based rolling hash, and is mostly needed to ensure that
 * we can cope with large blocksize values
 */
static void roll_hash(struct roll_state *self, unsigned char c)
{
	self->h2 -= self->h1;
	self->h2 += ROLLING_WINDOW * (uint32_t)c;

	self->h1 += (uint32_t)c;
	self->h1 -= (uint32_t)self->window[self->n % ROLLING_WINDOW];

	self->window[self->n % ROLLING_WINDOW] = c;
	self->n++;

	/* The original spamsum AND'ed this value with 0xFFFFFFFF which
	 * in theory should have no effect. This AND has been removed
	 * for performance (jk) */
	self->h3 <<= 5;
	self->h3 ^= c;
}

static uint32_t roll_sum(const struct roll_state *self)
{
	return self->h1 + self->h2 + self->h3;
}

/* A simple non-rolling hash, based on the FNV hash. */
static uint32_t sum_hash(unsigned char c, uint32_t h)
{
	return (h * HASH_PRIME) ^ c;
}
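/* How the two hashes cooperate: the rolling hash decides where a piece of
 * the input ends (a "reset point"), and the FNV state accumulated since the
 * previous reset point is then reduced to a single base64 character,
 * b64[h % 64], which becomes the next character of the signature. */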
/* A blockhash contains a signature state for a specific (implicit) blocksize.
 * The blocksize is given by the start_blocksize from ssdeep_context times two
 * to the power of the position of the element in the blockhashes member of
 * ssdeep_context (head position = 0). The h and halfh members are the FNV
 * hashes, where halfh stops being reset after digest is SPAMSUM_LENGTH/2
 * long. The halfh hash is needed to be able to truncate digest for the second
 * output hash to stay compatible with ssdeep output. */
struct blockhash_context {
	uint32_t h, halfh;
	char digest[SPAMSUM_LENGTH];
	unsigned int dlen;
	/*@null@*/ struct blockhash_context *next;
};

struct ssdeep_context {
	unsigned int start_blocksize;
	struct blockhash_context *blockhashes;
	size_t total_size;
	struct roll_state roll;
};

static int ssdeep_init(/*@out@*/ struct ssdeep_context *self)
{
	self->blockhashes = malloc(sizeof(struct blockhash_context));
	if(NULL == self->blockhashes)
		return -1;
	self->start_blocksize = MIN_BLOCKSIZE;
	self->blockhashes->h = HASH_INIT;
	self->blockhashes->halfh = HASH_INIT;
	self->blockhashes->dlen = 0;
	self->blockhashes->next = NULL;
	self->total_size = 0;
	roll_init(&self->roll);
	return 0;
}

static const char *b64 =
	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

static int ssdeep_engine_step(struct ssdeep_context *self, unsigned char c)
{
	size_t h;
	unsigned int bs;
	struct blockhash_context *bh;

	/* At each character we update the rolling hash and the normal hashes.
	 * When the rolling hash hits a reset value then we emit a normal hash
	 * as an element of the signature and reset the normal hash. */
	roll_hash(&self->roll, c);
	h = roll_sum(&self->roll);
	for(bh = self->blockhashes; NULL != bh; bh = bh->next) {
		bh->h = sum_hash(c, bh->h);
		bh->halfh = sum_hash(c, bh->halfh);
	}

	bs = self->start_blocksize;
	bh = self->blockhashes;
	while(bh != NULL) {
		/* With growing blocksize almost no runs fail the next test. */
		if(likely(h % bs != bs - 1))
			/* Once this condition is false for one bs, it is
			 * automatically false for all further bs. I.e. if
			 * h === -1 (mod 2*bs) then h === -1 (mod bs). */
			break;
		/* We don't need bs until the next iteration. Update now, so
		 * we don't forget about it. */
		bs *= 2;
		/* We have hit a reset point. We now emit hashes which are
		 * based on all characters in the piece of the message between
		 * the last reset point and this one. */
		if(unlikely(0 == bh->dlen)) {
			/* Can only happen 30 times. */
			/* First step for this blocksize. Clone next. */
			assert(NULL == bh->next);
			bh->next = malloc(sizeof(struct blockhash_context));
			if(NULL == bh->next)
				return -1;
			bh->next->h = bh->h;
			bh->next->halfh = bh->halfh;
			bh->next->dlen = 0;
			bh->next->next = NULL;
		}
		if(bh->dlen < SPAMSUM_LENGTH - 1) {
			/* We can have a problem with the tail overflowing. The
			 * easiest way to cope with this is to only reset the
			 * normal hash if we have room for more characters in
			 * our signature. This has the effect of combining the
			 * last few pieces of the message into a single
			 * piece. */
			bh->digest[bh->dlen++] = b64[bh->h % 64];
			bh->h = HASH_INIT;
			if(bh->dlen < SPAMSUM_LENGTH / 2)
				bh->halfh = HASH_INIT;
		} else if(NULL != bh->next &&
				(size_t)self->start_blocksize * SPAMSUM_LENGTH
					< self->total_size &&
				bh->next->dlen >= SPAMSUM_LENGTH / 2 &&
				bh == self->blockhashes) {
			/* Operating on the currently smallest blocksize and
			 * the next blocksize is already large enough. */
			self->start_blocksize *= 2;
			bh = bh->next;
			free(self->blockhashes);
			self->blockhashes = bh;
			continue;
		}
		bh = bh->next;
	}
	++self->total_size;
	return 0;
}

static int ssdeep_engine(struct ssdeep_context *self,
		const unsigned char *buffer, size_t buffer_size)
{
	for( ; buffer_size > 0; ++buffer, --buffer_size)
		if(ssdeep_engine_step(self, *buffer) < 0)
			return -1;
	return 0;
}
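/* The signature emitted below has the usual ssdeep shape
 * "blocksize:hash:hash": the decimal blocksize, the digest gathered for that
 * blocksize, and the digest for twice that blocksize truncated so it stays at
 * most SPAMSUM_LENGTH/2 characters long. A piece that has not yet hit a reset
 * point is flushed into each hash as one extra base64 character when the
 * rolling hash is non-zero. */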
static void ssdeep_digest(const struct ssdeep_context *self,
		/*@out@*/ char *result)
{
	unsigned int bs = self->start_blocksize;
	uint32_t h = roll_sum(&self->roll);
	int i, remain = FUZZY_MAX_RESULT - 1;
	const struct blockhash_context *bh = self->blockhashes, *bhp;

	/* Verify that our elimination was not overeager. */
	assert(bs == MIN_BLOCKSIZE ||
			(size_t)bs / 2 * SPAMSUM_LENGTH < self->total_size);

	/* Initial blocksize guess. */
	while((size_t)bs * SPAMSUM_LENGTH < self->total_size) {
		bs *= 2;
		assert(bh->next != NULL);
		bh = bh->next;
	}
	/* Adapt blocksize guess to actual digest length. */
	while(bs > self->start_blocksize && bh->dlen < SPAMSUM_LENGTH / 2) {
		bs /= 2;
		/* Slow version of bh = bh->prev; */
		for(bhp = self->blockhashes; bhp->next != bh; bhp = bhp->next)
			;
		bh = bhp;
	}
	assert(bh != NULL);
	assert(!(bs > MIN_BLOCKSIZE && bh->dlen < SPAMSUM_LENGTH / 2));

	i = snprintf(result, (size_t)remain, "%u:", bs);
	/* In theory snprintf can fail. It is unclear how though, so we assume
	 * that it doesn't for simplicity. */
	assert(i > 0);
	assert(i < remain);
	remain -= i;
	result += i;

	i = (int)bh->dlen;
	if(i > remain)
		i = remain;
	memcpy(result, bh->digest, (size_t)i);
	result += i;
	remain -= i;
	if(remain > 0 && h != 0) {
		*result++ = b64[bh->h % 64];
		--remain;
	}
	if(remain > 0) {
		*result++ = ':';
		--remain;
	}
	if(NULL != bh->next) {
		bh = bh->next;
		i = (int)bh->dlen;
		if(i > SPAMSUM_LENGTH / 2 - 1)
			i = SPAMSUM_LENGTH / 2 - 1;
		if(i > remain)
			i = remain;
		memcpy(result, bh->digest, (size_t)i);
		result += i;
		remain -= i;
		if(remain > 0 && h != 0) {
			*result++ = b64[bh->halfh % 64];
			--remain;
		}
	} else if(remain > 0 && h != 0) {
		assert(bh->dlen == 0);
		*result++ = b64[bh->h % 64];
		--remain;
	}
	*result = '\0';
}

static void ssdeep_clear(struct ssdeep_context *self)
{
	struct blockhash_context *bh, *bhn;

	bh = self->blockhashes;
	while(bh) {
		bhn = bh->next;
		free(bh);
		bh = bhn;
	}
}

int fuzzy_hash_buf(const unsigned char *buf, uint32_t buf_len,
		/*@out@*/ char *result)
{
	struct ssdeep_context ctx[1];

	if(ssdeep_init(ctx) < 0)
		return -1;
	if(ssdeep_engine(ctx, buf, buf_len) < 0) {
		ssdeep_clear(ctx);
		return -1;
	}
	ssdeep_digest(ctx, result);
	ssdeep_clear(ctx);
	return 0;
}

int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result)
{
	struct ssdeep_context ctx[1];
	unsigned char buffer[4096];
	size_t n;

	if(ssdeep_init(ctx) < 0)
		return -1;
	for(;;) {
		n = fread(buffer, 1, 4096, handle);
		if(0 == n)
			break;
		if(ssdeep_engine(ctx, buffer, n) < 0)
			goto errout;
	}
	if(ferror(handle) != 0)
		goto errout;
	ssdeep_digest(ctx, result);
	ssdeep_clear(ctx);
	return 0;
errout:
	ssdeep_clear(ctx);
	return -1;
}

#ifdef S_SPLINT_S
typedef size_t off_t;
#endif

int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result)
{
	off_t fpos;
	int status;

	fpos = ftello(handle);
	if(fseek(handle, 0, SEEK_SET) < 0)
		return -1;
	status = fuzzy_hash_stream(handle, result);
	if(status == 0)
		if(fseeko(handle, fpos, SEEK_SET) < 0)
			return -1;
	return status;
}

int fuzzy_hash_filename(const char *filename, /*@out@*/ char *result)
{
	int status;
	FILE *handle = fopen(filename, "rb");

	if(NULL == handle)
		return -1;
	status = fuzzy_hash_file(handle, result);
	/* We cannot do anything about an fclose failure. */
	(void)fclose(handle);
	return status;
}

#ifdef S_SPLINT_S
extern /*@only@*/ /*@null@*/ char *realpath(const char *,
		/*@null@*/ const char *);
#endif

int main(int argc, char **argv)
{
	char digest[FUZZY_MAX_RESULT], *p;
	int i;

	puts("ssdeep,1.1--blocksize:hash:hash,filename");
	for(i = 1; i < argc; ++i) {
		p = realpath(argv[i], NULL);
		if(p == NULL)
			return 1;
		if(fuzzy_hash_filename(argv[i], digest) < 0) {
			free(p);
			return 1;
		}
		printf("%s,\"%s\"\n", digest, p);
		free(p);
	}
	return 0;
}
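/* Example use of the buffer-based entry point (for illustration only; the
 * input string below is arbitrary). A caller just needs a result buffer of
 * FUZZY_MAX_RESULT bytes:
 *
 *	char digest[FUZZY_MAX_RESULT];
 *	const char *data = "an example input buffer";
 *
 *	if(fuzzy_hash_buf((const unsigned char *)data,
 *			(uint32_t)strlen(data), digest) == 0)
 *		puts(digest);
 */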