diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-03-24 17:15:40 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-03-24 17:15:40 +0100 |
commit | 4bcfb358f9c4e12a6cb4e1f519916006050898a8 (patch) | |
tree | a19c21646532de03fb8655d6910f48cb4c611048 | |
parent | 2107fa0e4f2ac170241a6da2e7233c4892ea5ec9 (diff) | |
download | ssdeep-4bcfb358f9c4e12a6cb4e1f519916006050898a8.tar.gz |
turn linked list of blockhashes into an array
* less memory management, everything we need fit in 2.5kb
* less scattering of data
* less code
-rw-r--r-- | fuzzy.c | 160 |
1 files changed, 65 insertions, 95 deletions
@@ -41,6 +41,7 @@ #define MIN_BLOCKSIZE 3 #define HASH_PRIME 0x01000193 #define HASH_INIT 0x28021967 +#define NUM_BLOCKHASHES 31 struct roll_state { unsigned char window[ROLLING_WINDOW]; @@ -99,65 +100,60 @@ struct blockhash_context { uint32_t h, halfh; char digest[SPAMSUM_LENGTH]; unsigned int dlen; - /*@null@*/ /*@only@*/ struct blockhash_context *next; }; -static int blockhash_fork(struct blockhash_context *bh) { - struct blockhash_context *nbh; - if(NULL == (nbh = malloc(sizeof(struct blockhash_context)))) - return -1; - nbh->h = bh->h; - nbh->halfh = bh->halfh; - nbh->dlen = 0; - nbh->next = NULL; - assert(NULL == bh->next); - bh->next = nbh; - return 0; -} - struct ssdeep_context { - unsigned int start_blocksize; - /*@only@*/ struct blockhash_context *blockhashes; + unsigned int bhstart, bhend; + struct blockhash_context bh[NUM_BLOCKHASHES]; size_t total_size; struct roll_state roll; }; +#define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index)) + static /*@only@*/ /*@null@*/ struct ssdeep_context *ssdeep_new(void) { struct ssdeep_context *self; if(NULL == (self = malloc(sizeof(struct ssdeep_context)))) return NULL; - self->blockhashes = malloc(sizeof(struct blockhash_context)); - if(NULL == self->blockhashes) { - free(self); - return NULL; - } - self->start_blocksize = MIN_BLOCKSIZE; - self->blockhashes->h = HASH_INIT; - self->blockhashes->halfh = HASH_INIT; - self->blockhashes->dlen = 0; - self->blockhashes->next = NULL; + self->bhstart = 0; + self->bhend = 1; + self->bh[0].h = HASH_INIT; + self->bh[0].halfh = HASH_INIT; + self->bh[0].dlen = 0; self->total_size = 0; roll_init(&self->roll); return self; } +static void ssdeep_try_fork_blockhash(struct ssdeep_context *self) { + struct blockhash_context *obh, *nbh; + if(self->bhend >= NUM_BLOCKHASHES) + return; + assert(self->bhend > 0); + obh = self->bh + (self->bhend - 1); + nbh = obh + 1; + nbh->h = obh->h; + nbh->halfh = obh->halfh; + nbh->dlen = 0; + ++self->bhend; +} + static void ssdeep_try_reduce_blockhash(struct ssdeep_context *self) { - struct blockhash_context *bh; - if(NULL == (bh = self->blockhashes->next)) - /* Cannot remove last hash. */ + assert(self->bhstart < self->bhend); + if(self->bhend - self->bhstart < 2) + /* Need at least two working hashes. */ return; - if((size_t)self->start_blocksize * SPAMSUM_LENGTH >= self->total_size) + if((size_t)SSDEEP_BS(self->bhstart) * SPAMSUM_LENGTH >= + self->total_size) /* Initial blocksize estimate would select this or a smaller * blocksize. */ return; - if(bh->dlen < SPAMSUM_LENGTH / 2) + if(self->bh[self->bhstart + 1].dlen < SPAMSUM_LENGTH / 2) /* Estimate adjustment would select this blocksize. */ return; /* At this point we are clearly no longer interested in the * start_blocksize. Get rid of it. */ - self->start_blocksize *= 2; - free(self->blockhashes); - self->blockhashes = bh; + ++self->bhstart; } static const char *b64 = @@ -165,24 +161,21 @@ static const char *b64 = static int ssdeep_engine_step(struct ssdeep_context *self, unsigned char c) { size_t h; - unsigned int bs; - struct blockhash_context *bh; + unsigned int i; /* At each character we update the rolling hash and the normal hashes. * When the rolling hash hits a reset value then we emit a normal hash * as a element of the signature and reset the normal hash. */ roll_hash(&self->roll, c); h = roll_sum(&self->roll); - for(bh = self->blockhashes; NULL != bh; bh = bh->next) { - bh->h = sum_hash(c, bh->h); - bh->halfh = sum_hash(c, bh->halfh); + for(i = self->bhstart; i < self->bhend; ++i) { + self->bh[i].h = sum_hash(c, self->bh[i].h); + self->bh[i].halfh = sum_hash(c, self->bh[i].halfh); } - bs = self->start_blocksize; - bh = self->blockhashes; - while(bh != NULL) { + for(i = self->bhstart; i < self->bhend; ++i) { /* With growing blocksize almost no runs fail the next test. */ - if(likely(h % bs != bs - 1)) + if(likely(h % SSDEEP_BS(i) != SSDEEP_BS(i) - 1)) /* Once this condition is false for one bs, it is * automatically false for all further bs. I.e. if * h === -1 (mod 2*bs) then h === -1 (mod bs). */ @@ -190,33 +183,25 @@ static int ssdeep_engine_step(struct ssdeep_context *self, unsigned char c) { /* We have hit a reset point. We now emit hashes which are * based on all characters in the piece of the message between * the last reset point and this one */ - if(unlikely(0 == bh->dlen && bs < UINT32_MAX / 2)) { + if(unlikely(0 == self->bh[i].dlen)) { /* Can only happen 30 times. */ /* First step for this blocksize. Clone next. */ - if(blockhash_fork(bh) < 0) - return -1; + ssdeep_try_fork_blockhash(self); } - /* We don't need bs until next iteration. Update now, so we - * don't forget about it. */ - bs *= 2; - if(bh->dlen < SPAMSUM_LENGTH - 1) { + if(self->bh[i].dlen < SPAMSUM_LENGTH - 1) { /* We can have a problem with the tail overflowing. The * easiest way to cope with this is to only reset the * normal hash if we have room for more characters in * our signature. This has the effect of combining the * last few pieces of the message into a single piece * */ - bh->digest[bh->dlen++] = b64[bh->h % 64]; - bh->h = HASH_INIT; - if(bh->dlen < SPAMSUM_LENGTH / 2) - bh->halfh = HASH_INIT; - } else { - /* The reduction might free bh. */ - bh = bh->next; + self->bh[i].digest[self->bh[i].dlen++] = + b64[self->bh[i].h % 64]; + self->bh[i].h = HASH_INIT; + if(self->bh[i].dlen < SPAMSUM_LENGTH / 2) + self->bh[i].halfh = HASH_INIT; + } else ssdeep_try_reduce_blockhash(self); - continue; - } - bh = bh->next; } ++self->total_size; return 0; @@ -232,70 +217,62 @@ static int ssdeep_engine(struct ssdeep_context *self, static int ssdeep_digest(const struct ssdeep_context *self, /*@out@*/ char *result) { - unsigned int bs = self->start_blocksize; + unsigned int bi = self->bhstart; uint32_t h = roll_sum(&self->roll); int i, remain = FUZZY_MAX_RESULT - 1; - const struct blockhash_context *bh = self->blockhashes, *bhp; /* Verify that our elimination was not overeager. */ - assert(bs == MIN_BLOCKSIZE || - (size_t)bs / 2 * SPAMSUM_LENGTH < self->total_size); + assert(bi == 0 || (size_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH < + self->total_size); /* Initial blocksize guess. */ - while((size_t)bs * SPAMSUM_LENGTH < self->total_size) { - bs *= 2; - if(bh->next == NULL) + while((size_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < self->total_size) { + ++bi; + if(bi >= self->bhend) /* The input exceeds data types. */ return -1; - bh = bh->next; } /* Adapt blocksize guess to actual digest length. */ - while(bs > self->start_blocksize && bh->dlen < SPAMSUM_LENGTH / 2) { - bs /= 2; - /* Slow version of bh = bh->prev; */ - for(bhp = self->blockhashes; bhp->next != bh; bhp = bhp->next) - ; - bh = bhp; - } - assert(bh != NULL); - assert(!(bs > MIN_BLOCKSIZE && bh->dlen < SPAMSUM_LENGTH / 2)); + while(bi > self->bhstart && self->bh[bi].dlen < SPAMSUM_LENGTH / 2) + --bi; + assert(!(bi > 0 && self->bh[bi].dlen < SPAMSUM_LENGTH / 2)); - i = snprintf(result, (size_t)remain, "%u:", bs); + i = snprintf(result, (size_t)remain, "%u:", SSDEEP_BS(bi)); if(i <= 0) return -1; assert(i < remain); remain -= i; result += i; - i = (int)bh->dlen; + i = (int)self->bh[bi].dlen; if(i > remain) i = remain; - memcpy(result, bh->digest, (size_t)i); + memcpy(result, self->bh[bi].digest, (size_t)i); result += i; remain -= i; if(remain > 0 && h != 0) { - *result++ = b64[bh->h % 64]; + *result++ = b64[self->bh[bi].h % 64]; --remain; } if(remain > 0) { *result++ = ':'; --remain; } - if(NULL != bh->next) { - bh = bh->next; - i = (int)bh->dlen; + if(bi < self->bhend - 1) { + ++bi; + i = (int)self->bh[bi].dlen; if(i > SPAMSUM_LENGTH / 2 - 1) i = SPAMSUM_LENGTH / 2 - 1; if(i > remain) i = remain; - memcpy(result, bh->digest, (size_t)i); + memcpy(result, self->bh[bi].digest, (size_t)i); result += i; remain -= i; if(remain > 0 && h != 0) { - *result++ = b64[bh->halfh % 64]; + *result++ = b64[self->bh[bi].halfh % 64]; --remain; } } else if(remain > 0 && h != 0) { - assert(bh->dlen == 0); - *result++ = b64[bh->h % 64]; + assert(self->bh[bi].dlen == 0); + *result++ = b64[self->bh[bi].h % 64]; --remain; } *result = '\0'; @@ -303,13 +280,6 @@ static int ssdeep_digest(const struct ssdeep_context *self, } static void ssdeep_free(/*@only@*/ struct ssdeep_context *self) { - struct blockhash_context *bh, *bhn; - bh = self->blockhashes; - while(bh) { - bhn = bh->next; - free(bh); - bh = bhn; - } free(self); } |