diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-03-24 14:09:36 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-03-24 14:09:36 +0100 |
commit | 4b9ae0055299bb38b861fc3c6b65cbdfffa6d4c9 (patch) | |
tree | e512704be370bbe781973abe49ff99326da11724 | |
download | ssdeep-4b9ae0055299bb38b861fc3c6b65cbdfffa6d4c9.tar.gz |
initial checking
This is a rewrite of ssdeep's fuzzy.c to do streaming hashes. It is a
bit slower, but the memory consumption is bounded in all cases.
-rw-r--r-- | fuzzy.c | 375 |
1 files changed, 375 insertions, 0 deletions
@@ -0,0 +1,375 @@ +/* ssdeep + * Copyright (C) 2002 Andrew Tridgell <tridge@samba.org> + * Copyright (C) 2006 ManTech International Corporation + * Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + * Earlier versions of this code were named fuzzy.c and can be found at: + * http://www.samba.org/ftp/unpacked/junkcode/spamsum/ + * http://ssdeep.sf.net/ + */ + +#include <assert.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#if defined(__GNUC__) && __GNUC__ >= 3 +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#else +#define likely(x) x +#define unlikely(x) x +#endif + +#define SPAMSUM_LENGTH 64 +#define FUZZY_MAX_RESULT (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20)) +#define ROLLING_WINDOW 7 +#define MIN_BLOCKSIZE 3 +#define HASH_PRIME 0x01000193 +#define HASH_INIT 0x28021967 + +struct roll_state { + unsigned char window[ROLLING_WINDOW]; + uint32_t h1, h2, h3; + uint32_t n; +}; + +static void roll_init(/*@out@*/ struct roll_state *self) { + memset(self, 0, sizeof(struct roll_state)); +} + +/* + * a rolling hash, based on the Adler checksum. By using a rolling hash + * we can perform auto resynchronisation after inserts/deletes + + * internally, h1 is the sum of the bytes in the window and h2 + * is the sum of the bytes times the index + + * h3 is a shift/xor based rolling hash, and is mostly needed to ensure that + * we can cope with large blocksize values + */ +static void roll_hash(struct roll_state *self, unsigned char c) { + self->h2 -= self->h1; + self->h2 += ROLLING_WINDOW * (uint32_t)c; + + self->h1 += (uint32_t)c; + self->h1 -= (uint32_t)self->window[self->n % ROLLING_WINDOW]; + + self->window[self->n % ROLLING_WINDOW] = c; + self->n++; + + /* The original spamsum AND'ed this value with 0xFFFFFFFF which + * in theory should have no effect. This AND has been removed + * for performance (jk) */ + self->h3 <<= 5; + self->h3 ^= c; +} + +static uint32_t roll_sum(const struct roll_state *self) { + return self->h1 + self->h2 + self->h3; +} + +/* A simple non-rolling hash, based on the FNV hash. */ +static uint32_t sum_hash(unsigned char c, uint32_t h) { + return (h * HASH_PRIME) ^ c; +} + +/* A blockhash contains a signature state for a specific (implicit) blocksize. + * The blocksize is given by the start_blocksize from ssdeep_context times two + * to the power of the position of the element in the blockhashes member of + * ssdeep_context (head position = 0). The h and halfh members are the FNV + * hashes, where halfh stops to be reset after digest is SPAMSUM_LENGTH/2 long. + * The halfh hash is needed be able to truncate digest for the second output + * hash to stay compatible with ssdeep output. */ +struct blockhash_context { + uint32_t h, halfh; + char digest[SPAMSUM_LENGTH]; + unsigned int dlen; + /*@null@*/ struct blockhash_context *next; +}; + +struct ssdeep_context { + unsigned int start_blocksize; + struct blockhash_context *blockhashes; + size_t total_size; + struct roll_state roll; +}; + +static int ssdeep_init(/*@out@*/ struct ssdeep_context *self) { + self->blockhashes = malloc(sizeof(struct blockhash_context)); + if(NULL == self->blockhashes) + return -1; + self->start_blocksize = MIN_BLOCKSIZE; + self->blockhashes->h = HASH_INIT; + self->blockhashes->halfh = HASH_INIT; + self->blockhashes->dlen = 0; + self->blockhashes->next = NULL; + self->total_size = 0; + roll_init(&self->roll); + return 0; +} + +static const char *b64 = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +static int ssdeep_engine_step(struct ssdeep_context *self, unsigned char c) { + size_t h; + unsigned int bs; + struct blockhash_context *bh; + /* At each character we update the rolling hash and the normal hashes. + * When the rolling hash hits a reset value then we emit a normal hash + * as a element of the signature and reset the normal hash. */ + roll_hash(&self->roll, c); + h = roll_sum(&self->roll); + + for(bh = self->blockhashes; NULL != bh; bh = bh->next) { + bh->h = sum_hash(c, bh->h); + bh->halfh = sum_hash(c, bh->halfh); + } + + bs = self->start_blocksize; + bh = self->blockhashes; + while(bh != NULL) { + /* With growing blocksize almost no runs fail the next test. */ + if(likely(h % bs != bs - 1)) + /* Once this condition is false for one bs, it is + * automatically false for all further bs. I.e. if + * h === -1 (mod 2*bs) then h === -1 (mod bs). */ + break; + /* We don't need bs until next iteration. Update now, so we + * don't forget about it. */ + bs *= 2; + /* We have hit a reset point. We now emit hashes which are + * based on all characters in the piece of the message between + * the last reset point and this one */ + if(unlikely(0 == bh->dlen)) { /* Can only happen 30 times. */ + /* First step for this blocksize. Clone next. */ + assert(NULL == bh->next); + bh->next = malloc(sizeof( + struct blockhash_context)); + if(NULL == bh->next) + return -1; + bh->next->h = bh->h; + bh->next->dlen = 0; + bh->next->next = NULL; + } + if(bh->dlen < SPAMSUM_LENGTH - 1) { + /* We can have a problem with the tail overflowing. The + * easiest way to cope with this is to only reset the + * normal hash if we have room for more characters in + * our signature. This has the effect of combining the + * last few pieces of the message into a single piece + * */ + bh->digest[bh->dlen++] = b64[bh->h % 64]; + bh->h = HASH_INIT; + if(bh->dlen < SPAMSUM_LENGTH / 2) + bh->halfh = HASH_INIT; + } else if(NULL != bh->next && + (size_t)self->start_blocksize * SPAMSUM_LENGTH < + self->total_size && + bh->next->dlen >= SPAMSUM_LENGTH / 2 && + bh == self->blockhashes) { + /* Operating on the currently smallest blocksize and + * the next blocksize is already large enough. */ + self->start_blocksize *= 2; + bh = bh->next; + free(self->blockhashes); + self->blockhashes = bh; + continue; + } + bh = bh->next; + } + ++self->total_size; + return 0; +} + +static int ssdeep_engine(struct ssdeep_context *self, + const unsigned char *buffer, size_t buffer_size) { + for( ;buffer_size > 0; ++buffer, --buffer_size) + if(ssdeep_engine_step(self, *buffer) < 0) + return -1; + return 0; +} + +static void ssdeep_digest(const struct ssdeep_context *self, + /*@out@*/ char *result) { + unsigned int bs = self->start_blocksize; + uint32_t h = roll_sum(&self->roll); + int i, remain = FUZZY_MAX_RESULT - 1; + const struct blockhash_context *bh = self->blockhashes, *bhp; + /* Verify that our elimination was not overeager. */ + assert(bs == MIN_BLOCKSIZE || + (size_t)bs / 2 * SPAMSUM_LENGTH < self->total_size); + + /* Initial blocksize guess. */ + while((size_t)bs * SPAMSUM_LENGTH < self->total_size) { + bs *= 2; + assert(bh->next != NULL); + bh = bh->next; + } + /* Adapt blocksize guess to actual digest length. */ + while(bs > self->start_blocksize && bh->dlen < SPAMSUM_LENGTH / 2) { + bs /= 2; + /* Slow version of bh = bh->prev; */ + for(bhp = self->blockhashes; bhp->next != bh; bhp = bhp->next) + ; + bh = bhp; + } + assert(bh != NULL); + assert(!(bs > MIN_BLOCKSIZE && bh->dlen < SPAMSUM_LENGTH / 2)); + + i = snprintf(result, (size_t)remain, "%u:", bs); + /* In theory snprintf can fail. It is unclear how though, so we assume + * that it doesn't for simplicity. */ + assert(i > 0); + assert(i < remain); + remain -= i; + result += i; + i = (int)bh->dlen; + if(i > remain) + i = remain; + memcpy(result, bh->digest, (size_t)i); + result += i; + remain -= i; + if(remain > 0 && h != 0) { + *result++ = b64[bh->h % 64]; + --remain; + } + if(remain > 0) { + *result++ = ':'; + --remain; + } + if(NULL != bh->next) { + bh = bh->next; + i = (int)bh->dlen; + if(i > SPAMSUM_LENGTH / 2 - 1) + i = SPAMSUM_LENGTH / 2 - 1; + if(i > remain) + i = remain; + memcpy(result, bh->digest, (size_t)i); + result += i; + remain -= i; + if(remain > 0 && h != 0) { + *result++ = b64[bh->halfh % 64]; + --remain; + } + } else if(remain > 0 && h != 0) { + assert(bh->dlen == 0); + *result++ = b64[bh->h % 64]; + --remain; + } + *result = '\0'; +} + +static void ssdeep_clear(struct ssdeep_context *self) { + struct blockhash_context *bh, *bhn; + bh = self->blockhashes; + while(bh) { + bhn = bh->next; + free(bh); + bh = bhn; + } +} + +int fuzzy_hash_buf(const unsigned char *buf, uint32_t buf_len, + /*@out@*/ char *result) { + struct ssdeep_context ctx[1]; + if(ssdeep_init(ctx) < 0) + return -1; + if(ssdeep_engine(ctx, buf, buf_len) < 0) { + ssdeep_clear(ctx); + return -1; + } + ssdeep_digest(ctx, result); + ssdeep_clear(ctx); + return 0; +} + +int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result) { + struct ssdeep_context ctx[1]; + unsigned char buffer[4096]; + size_t n; + if(ssdeep_init(ctx) < 0) + return -1; + for(;;) { + n = fread(buffer, 1, 4096, handle); + if(0 == n) + break; + if(ssdeep_engine(ctx, buffer, n) < 0) + goto errout; + } + if(ferror(handle) != 0) + goto errout; + ssdeep_digest(ctx, result); + ssdeep_clear(ctx); + return 0; +errout: + ssdeep_clear(ctx); + return -1; +} + +#ifdef S_SPLINT_S +typedef size_t off_t; +#endif + +int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result) { + off_t fpos; + int status; + fpos = ftello(handle); + if(fseek(handle, 0, SEEK_SET) < 0) + return -1; + status = fuzzy_hash_stream(handle, result); + if(status == 0) + if(fseeko(handle, fpos, SEEK_SET) < 0) + return -1; + return status; +} + +int fuzzy_hash_filename(const char *filename, /*@out@*/ char *result) { + int status; + FILE *handle = fopen(filename, "rb"); + if(NULL == handle) + return -1; + status = fuzzy_hash_file(handle, result); + /* We cannot do anything about an fclose failure. */ + (void)fclose(handle); + return status; +} + +#ifdef S_SPLINT_S +extern /*@only@*/ /*@null@*/ char *realpath(const char *, + /*@null@*/ const char *); +#endif + +int main(int argc, char **argv) { + char digest[FUZZY_MAX_RESULT], *p; + int i; + puts("ssdeep,1.1--blocksize:hash:hash,filename"); + for(i = 1; i < argc; ++i) { + p = realpath(argv[i], NULL); + if(p == NULL) + return 1; + if(fuzzy_hash_filename(argv[i], digest) < 0) { + free(p); + return 1; + } + printf("%s,\"%s\"\n", digest, p); + free(p); + } + return 0; +} |