summary refs log tree commit diff
path: root/fuzzy.c
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2013-03-24 17:15:40 +0100
committerHelmut Grohne <helmut@subdivi.de>2013-03-24 17:15:40 +0100
commit4bcfb358f9c4e12a6cb4e1f519916006050898a8 (patch)
treea19c21646532de03fb8655d6910f48cb4c611048 /fuzzy.c
parent2107fa0e4f2ac170241a6da2e7233c4892ea5ec9 (diff)
downloadssdeep-4bcfb358f9c4e12a6cb4e1f519916006050898a8.tar.gz
turn linked list of blockhashes into an array
* less memory management, everything we need fits in 2.5kb * less scattering of data * less code
Diffstat (limited to 'fuzzy.c')
-rw-r--r--fuzzy.c160
1 file changed, 65 insertions, 95 deletions
diff --git a/fuzzy.c b/fuzzy.c
index 88af850..83cf3ce 100644
--- a/fuzzy.c
+++ b/fuzzy.c
@@ -41,6 +41,7 @@
#define MIN_BLOCKSIZE 3
#define HASH_PRIME 0x01000193
#define HASH_INIT 0x28021967
+#define NUM_BLOCKHASHES 31
struct roll_state {
unsigned char window[ROLLING_WINDOW];
@@ -99,65 +100,60 @@ struct blockhash_context {
uint32_t h, halfh;
char digest[SPAMSUM_LENGTH];
unsigned int dlen;
- /*@null@*/ /*@only@*/ struct blockhash_context *next;
};
-static int blockhash_fork(struct blockhash_context *bh) {
- struct blockhash_context *nbh;
- if(NULL == (nbh = malloc(sizeof(struct blockhash_context))))
- return -1;
- nbh->h = bh->h;
- nbh->halfh = bh->halfh;
- nbh->dlen = 0;
- nbh->next = NULL;
- assert(NULL == bh->next);
- bh->next = nbh;
- return 0;
-}
-
struct ssdeep_context {
- unsigned int start_blocksize;
- /*@only@*/ struct blockhash_context *blockhashes;
+ unsigned int bhstart, bhend;
+ struct blockhash_context bh[NUM_BLOCKHASHES];
size_t total_size;
struct roll_state roll;
};
+#define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index))
+
static /*@only@*/ /*@null@*/ struct ssdeep_context *ssdeep_new(void) {
struct ssdeep_context *self;
if(NULL == (self = malloc(sizeof(struct ssdeep_context))))
return NULL;
- self->blockhashes = malloc(sizeof(struct blockhash_context));
- if(NULL == self->blockhashes) {
- free(self);
- return NULL;
- }
- self->start_blocksize = MIN_BLOCKSIZE;
- self->blockhashes->h = HASH_INIT;
- self->blockhashes->halfh = HASH_INIT;
- self->blockhashes->dlen = 0;
- self->blockhashes->next = NULL;
+ self->bhstart = 0;
+ self->bhend = 1;
+ self->bh[0].h = HASH_INIT;
+ self->bh[0].halfh = HASH_INIT;
+ self->bh[0].dlen = 0;
self->total_size = 0;
roll_init(&self->roll);
return self;
}
+static void ssdeep_try_fork_blockhash(struct ssdeep_context *self) {
+ struct blockhash_context *obh, *nbh;
+ if(self->bhend >= NUM_BLOCKHASHES)
+ return;
+ assert(self->bhend > 0);
+ obh = self->bh + (self->bhend - 1);
+ nbh = obh + 1;
+ nbh->h = obh->h;
+ nbh->halfh = obh->halfh;
+ nbh->dlen = 0;
+ ++self->bhend;
+}
+
static void ssdeep_try_reduce_blockhash(struct ssdeep_context *self) {
- struct blockhash_context *bh;
- if(NULL == (bh = self->blockhashes->next))
- /* Cannot remove last hash. */
+ assert(self->bhstart < self->bhend);
+ if(self->bhend - self->bhstart < 2)
+ /* Need at least two working hashes. */
return;
- if((size_t)self->start_blocksize * SPAMSUM_LENGTH >= self->total_size)
+ if((size_t)SSDEEP_BS(self->bhstart) * SPAMSUM_LENGTH >=
+ self->total_size)
/* Initial blocksize estimate would select this or a smaller
* blocksize. */
return;
- if(bh->dlen < SPAMSUM_LENGTH / 2)
+ if(self->bh[self->bhstart + 1].dlen < SPAMSUM_LENGTH / 2)
/* Estimate adjustment would select this blocksize. */
return;
/* At this point we are clearly no longer interested in the
* start_blocksize. Get rid of it. */
- self->start_blocksize *= 2;
- free(self->blockhashes);
- self->blockhashes = bh;
+ ++self->bhstart;
}
static const char *b64 =
@@ -165,24 +161,21 @@ static const char *b64 =
static int ssdeep_engine_step(struct ssdeep_context *self, unsigned char c) {
size_t h;
- unsigned int bs;
- struct blockhash_context *bh;
+ unsigned int i;
/* At each character we update the rolling hash and the normal hashes.
* When the rolling hash hits a reset value then we emit a normal hash
* as a element of the signature and reset the normal hash. */
roll_hash(&self->roll, c);
h = roll_sum(&self->roll);
- for(bh = self->blockhashes; NULL != bh; bh = bh->next) {
- bh->h = sum_hash(c, bh->h);
- bh->halfh = sum_hash(c, bh->halfh);
+ for(i = self->bhstart; i < self->bhend; ++i) {
+ self->bh[i].h = sum_hash(c, self->bh[i].h);
+ self->bh[i].halfh = sum_hash(c, self->bh[i].halfh);
}
- bs = self->start_blocksize;
- bh = self->blockhashes;
- while(bh != NULL) {
+ for(i = self->bhstart; i < self->bhend; ++i) {
/* With growing blocksize almost no runs fail the next test. */
- if(likely(h % bs != bs - 1))
+ if(likely(h % SSDEEP_BS(i) != SSDEEP_BS(i) - 1))
/* Once this condition is false for one bs, it is
* automatically false for all further bs. I.e. if
* h === -1 (mod 2*bs) then h === -1 (mod bs). */
@@ -190,33 +183,25 @@ static int ssdeep_engine_step(struct ssdeep_context *self, unsigned char c) {
/* We have hit a reset point. We now emit hashes which are
* based on all characters in the piece of the message between
* the last reset point and this one */
- if(unlikely(0 == bh->dlen && bs < UINT32_MAX / 2)) {
+ if(unlikely(0 == self->bh[i].dlen)) {
/* Can only happen 30 times. */
/* First step for this blocksize. Clone next. */
- if(blockhash_fork(bh) < 0)
- return -1;
+ ssdeep_try_fork_blockhash(self);
}
- /* We don't need bs until next iteration. Update now, so we
- * don't forget about it. */
- bs *= 2;
- if(bh->dlen < SPAMSUM_LENGTH - 1) {
+ if(self->bh[i].dlen < SPAMSUM_LENGTH - 1) {
/* We can have a problem with the tail overflowing. The
* easiest way to cope with this is to only reset the
* normal hash if we have room for more characters in
* our signature. This has the effect of combining the
* last few pieces of the message into a single piece
* */
- bh->digest[bh->dlen++] = b64[bh->h % 64];
- bh->h = HASH_INIT;
- if(bh->dlen < SPAMSUM_LENGTH / 2)
- bh->halfh = HASH_INIT;
- } else {
- /* The reduction might free bh. */
- bh = bh->next;
+ self->bh[i].digest[self->bh[i].dlen++] =
+ b64[self->bh[i].h % 64];
+ self->bh[i].h = HASH_INIT;
+ if(self->bh[i].dlen < SPAMSUM_LENGTH / 2)
+ self->bh[i].halfh = HASH_INIT;
+ } else
ssdeep_try_reduce_blockhash(self);
- continue;
- }
- bh = bh->next;
}
++self->total_size;
return 0;
@@ -232,70 +217,62 @@ static int ssdeep_engine(struct ssdeep_context *self,
static int ssdeep_digest(const struct ssdeep_context *self,
/*@out@*/ char *result) {
- unsigned int bs = self->start_blocksize;
+ unsigned int bi = self->bhstart;
uint32_t h = roll_sum(&self->roll);
int i, remain = FUZZY_MAX_RESULT - 1;
- const struct blockhash_context *bh = self->blockhashes, *bhp;
/* Verify that our elimination was not overeager. */
- assert(bs == MIN_BLOCKSIZE ||
- (size_t)bs / 2 * SPAMSUM_LENGTH < self->total_size);
+ assert(bi == 0 || (size_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH <
+ self->total_size);
/* Initial blocksize guess. */
- while((size_t)bs * SPAMSUM_LENGTH < self->total_size) {
- bs *= 2;
- if(bh->next == NULL)
+ while((size_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < self->total_size) {
+ ++bi;
+ if(bi >= self->bhend)
/* The input exceeds data types. */
return -1;
- bh = bh->next;
}
/* Adapt blocksize guess to actual digest length. */
- while(bs > self->start_blocksize && bh->dlen < SPAMSUM_LENGTH / 2) {
- bs /= 2;
- /* Slow version of bh = bh->prev; */
- for(bhp = self->blockhashes; bhp->next != bh; bhp = bhp->next)
- ;
- bh = bhp;
- }
- assert(bh != NULL);
- assert(!(bs > MIN_BLOCKSIZE && bh->dlen < SPAMSUM_LENGTH / 2));
+ while(bi > self->bhstart && self->bh[bi].dlen < SPAMSUM_LENGTH / 2)
+ --bi;
+ assert(!(bi > 0 && self->bh[bi].dlen < SPAMSUM_LENGTH / 2));
- i = snprintf(result, (size_t)remain, "%u:", bs);
+ i = snprintf(result, (size_t)remain, "%u:", SSDEEP_BS(bi));
if(i <= 0)
return -1;
assert(i < remain);
remain -= i;
result += i;
- i = (int)bh->dlen;
+ i = (int)self->bh[bi].dlen;
if(i > remain)
i = remain;
- memcpy(result, bh->digest, (size_t)i);
+ memcpy(result, self->bh[bi].digest, (size_t)i);
result += i;
remain -= i;
if(remain > 0 && h != 0) {
- *result++ = b64[bh->h % 64];
+ *result++ = b64[self->bh[bi].h % 64];
--remain;
}
if(remain > 0) {
*result++ = ':';
--remain;
}
- if(NULL != bh->next) {
- bh = bh->next;
- i = (int)bh->dlen;
+ if(bi < self->bhend - 1) {
+ ++bi;
+ i = (int)self->bh[bi].dlen;
if(i > SPAMSUM_LENGTH / 2 - 1)
i = SPAMSUM_LENGTH / 2 - 1;
if(i > remain)
i = remain;
- memcpy(result, bh->digest, (size_t)i);
+ memcpy(result, self->bh[bi].digest, (size_t)i);
result += i;
remain -= i;
if(remain > 0 && h != 0) {
- *result++ = b64[bh->halfh % 64];
+ *result++ = b64[self->bh[bi].halfh % 64];
--remain;
}
} else if(remain > 0 && h != 0) {
- assert(bh->dlen == 0);
- *result++ = b64[bh->h % 64];
+ assert(self->bh[bi].dlen == 0);
+ *result++ = b64[self->bh[bi].h % 64];
--remain;
}
*result = '\0';
@@ -303,13 +280,6 @@ static int ssdeep_digest(const struct ssdeep_context *self,
}
static void ssdeep_free(/*@only@*/ struct ssdeep_context *self) {
- struct blockhash_context *bh, *bhn;
- bh = self->blockhashes;
- while(bh) {
- bhn = bh->next;
- free(bh);
- bh = bhn;
- }
free(self);
}