properly set errno
[~helmut/ssdeep.git] / fuzzy.c
diff --git a/fuzzy.c b/fuzzy.c
index ed1e212..a9e6d4a 100644 (file)
--- a/fuzzy.c
+++ b/fuzzy.c
  */
 
 #include <assert.h>
+#include <errno.h>
 #include <stdint.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include "fuzzy.h"
 
 #if defined(__GNUC__) && __GNUC__ >= 3
 #define likely(x)       __builtin_expect(!!(x), 1)
 #define unlikely(x) x
 #endif
 
-#define SPAMSUM_LENGTH 64
-#define FUZZY_MAX_RESULT (SPAMSUM_LENGTH + (SPAMSUM_LENGTH/2 + 20))
 #define ROLLING_WINDOW 7
 #define MIN_BLOCKSIZE 3
 #define HASH_PRIME 0x01000193
 #define HASH_INIT 0x28021967
+#define NUM_BLOCKHASHES 31
 
 struct roll_state {
        unsigned char window[ROLLING_WINDOW];
@@ -90,259 +91,244 @@ static uint32_t sum_hash(unsigned char c, uint32_t h) {
 }
 
 /* A blockhash contains a signature state for a specific (implicit) blocksize.
- * The blocksize is given by the start_blocksize from ssdeep_context times two
- * to the power of the position of the element in the blockhashes member of
- * ssdeep_context (head position = 0). The h and halfh members are the FNV
- * hashes, where halfh stops to be reset after digest is SPAMSUM_LENGTH/2 long.
- * The halfh hash is needed be able to truncate digest for the second output
- * hash to stay compatible with ssdeep output. */
+ * The blocksize is given by SSDEEP_BS(index). The h and halfh members are the
+ * FNV hashes, where halfh stops to be reset after digest is SPAMSUM_LENGTH/2
+ * long. The halfh hash is needed be able to truncate digest for the second
+ * output hash to stay compatible with ssdeep output. */
 struct blockhash_context {
        uint32_t h, halfh;
        char digest[SPAMSUM_LENGTH];
        unsigned int dlen;
-       /*@null@*/ /*@only@*/ struct blockhash_context *next;
 };
 
-static int blockhash_fork(struct blockhash_context *bh) {
-       struct blockhash_context *nbh;
-       if(NULL == (nbh = malloc(sizeof(struct blockhash_context))))
-               return -1;
-       nbh->h = bh->h;
-       nbh->halfh = bh->halfh;
-       nbh->dlen = 0;
-       nbh->next = NULL;
-       assert(NULL == bh->next);
-       bh->next = nbh;
-       return 0;
-}
-
-struct ssdeep_context {
-       unsigned int start_blocksize;
-       /*@only@*/ struct blockhash_context *blockhashes;
+struct fuzzy_state {
+       unsigned int bhstart, bhend;
+       struct blockhash_context bh[NUM_BLOCKHASHES];
        size_t total_size;
        struct roll_state roll;
 };
 
-static int ssdeep_init(/*@out@*/ struct ssdeep_context *self) {
-       self->blockhashes = malloc(sizeof(struct blockhash_context));
-       if(NULL == self->blockhashes)
-               return -1;
-       self->start_blocksize = MIN_BLOCKSIZE;
-       self->blockhashes->h = HASH_INIT;
-       self->blockhashes->halfh = HASH_INIT;
-       self->blockhashes->dlen = 0;
-       self->blockhashes->next = NULL;
+#define SSDEEP_BS(index) (((uint32_t)MIN_BLOCKSIZE) << (index))
+
+/*@only@*/ /*@null@*/ struct fuzzy_state *fuzzy_new(void) {
+       struct fuzzy_state *self;
+       if(NULL == (self = malloc(sizeof(struct fuzzy_state))))
+               /* malloc sets ENOMEM */
+               return NULL;
+       self->bhstart = 0;
+       self->bhend = 1;
+       self->bh[0].h = HASH_INIT;
+       self->bh[0].halfh = HASH_INIT;
+       self->bh[0].dlen = 0;
        self->total_size = 0;
        roll_init(&self->roll);
-       return 0;
+       return self;
 }
 
-static void ssdeep_try_reduce_blockhash(struct ssdeep_context *self) {
-       struct blockhash_context *bh;
-       if(NULL == (bh = self->blockhashes->next))
-               /* Cannot remove last hash. */
+static void fuzzy_try_fork_blockhash(struct fuzzy_state *self) {
+       struct blockhash_context *obh, *nbh;
+       if(self->bhend >= NUM_BLOCKHASHES)
                return;
-       if((size_t)self->start_blocksize * SPAMSUM_LENGTH >= self->total_size)
+       assert(self->bhend > 0);
+       obh = self->bh + (self->bhend - 1);
+       nbh = obh + 1;
+       nbh->h = obh->h;
+       nbh->halfh = obh->halfh;
+       nbh->dlen = 0;
+       ++self->bhend;
+}
+
+static void fuzzy_try_reduce_blockhash(struct fuzzy_state *self) {
+       assert(self->bhstart < self->bhend);
+       if(self->bhend - self->bhstart < 2)
+               /* Need at least two working hashes. */
+               return;
+       if((size_t)SSDEEP_BS(self->bhstart) * SPAMSUM_LENGTH >=
+                       self->total_size)
                /* Initial blocksize estimate would select this or a smaller
                 * blocksize. */
                return;
-       if(bh->dlen < SPAMSUM_LENGTH / 2)
+       if(self->bh[self->bhstart + 1].dlen < SPAMSUM_LENGTH / 2)
                /* Estimate adjustment would select this blocksize. */
                return;
        /* At this point we are clearly no longer interested in the
         * start_blocksize. Get rid of it. */
-       self->start_blocksize *= 2;
-       free(self->blockhashes);
-       self->blockhashes = bh;
+       ++self->bhstart;
 }
 
 static const char *b64 =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
-static int ssdeep_engine_step(struct ssdeep_context *self, unsigned char c) {
+static void fuzzy_engine_step(struct fuzzy_state *self, unsigned char c) {
        size_t h;
-       unsigned int bs;
-       struct blockhash_context *bh;
+       unsigned int i;
        /* At each character we update the rolling hash and the normal hashes.
         * When the rolling hash hits a reset value then we emit a normal hash
         * as a element of the signature and reset the normal hash. */
        roll_hash(&self->roll, c);
        h = roll_sum(&self->roll);
 
-       for(bh = self->blockhashes; NULL != bh; bh = bh->next) {
-               bh->h = sum_hash(c, bh->h);
-               bh->halfh = sum_hash(c, bh->halfh);
+       for(i = self->bhstart; i < self->bhend; ++i) {
+               self->bh[i].h = sum_hash(c, self->bh[i].h);
+               self->bh[i].halfh = sum_hash(c, self->bh[i].halfh);
        }
 
-       bs = self->start_blocksize;
-       bh = self->blockhashes;
-       while(bh != NULL) {
+       for(i = self->bhstart; i < self->bhend; ++i) {
                /* With growing blocksize almost no runs fail the next test. */
-               if(likely(h % bs != bs - 1))
+               if(likely(h % SSDEEP_BS(i) != SSDEEP_BS(i) - 1))
                        /* Once this condition is false for one bs, it is
                         * automatically false for all further bs. I.e. if
                         * h === -1 (mod 2*bs) then h === -1 (mod bs). */
                        break;
-               /* We don't need bs until next iteration. Update now, so we
-                * don't forget about it. */
-               bs *= 2;
                /* We have hit a reset point. We now emit hashes which are
                 * based on all characters in the piece of the message between
                 * the last reset point and this one */
-               if(unlikely(0 == bh->dlen)) /* Can only happen 30 times. */
+               if(unlikely(0 == self->bh[i].dlen)) {
+                       /* Can only happen 30 times. */
                        /* First step for this blocksize. Clone next. */
-                       if(blockhash_fork(bh) < 0)
-                               return -1;
-               if(bh->dlen < SPAMSUM_LENGTH - 1) {
+                       fuzzy_try_fork_blockhash(self);
+               }
+               if(self->bh[i].dlen < SPAMSUM_LENGTH - 1) {
                        /* We can have a problem with the tail overflowing. The
                         * easiest way to cope with this is to only reset the
                         * normal hash if we have room for more characters in
                         * our signature. This has the effect of combining the
                         * last few pieces of the message into a single piece
                         * */
-                       bh->digest[bh->dlen++] = b64[bh->h % 64];
-                       bh->h = HASH_INIT;
-                       if(bh->dlen < SPAMSUM_LENGTH / 2)
-                               bh->halfh = HASH_INIT;
-               } else {
-                       /* The reduction might free bh. */
-                       bh = bh->next;
-                       ssdeep_try_reduce_blockhash(self);
-                       continue;
-               }
-               bh = bh->next;
+                       self->bh[i].digest[self->bh[i].dlen++] =
+                               b64[self->bh[i].h % 64];
+                       self->bh[i].h = HASH_INIT;
+                       if(self->bh[i].dlen < SPAMSUM_LENGTH / 2)
+                               self->bh[i].halfh = HASH_INIT;
+               } else
+                       fuzzy_try_reduce_blockhash(self);
        }
-       ++self->total_size;
-       return 0;
 }
 
-static int ssdeep_engine(struct ssdeep_context *self,
-               const unsigned char *buffer, size_t buffer_size) {
+int fuzzy_update(struct fuzzy_state *self, const unsigned char *buffer,
+               size_t buffer_size) {
+       self->total_size += buffer_size;
        for( ;buffer_size > 0; ++buffer, --buffer_size)
-               if(ssdeep_engine_step(self, *buffer) < 0)
-                       return -1;
+               fuzzy_engine_step(self, *buffer);
        return 0;
 }
 
-static void ssdeep_digest(const struct ssdeep_context *self,
-               /*@out@*/ char *result) {
-       unsigned int bs = self->start_blocksize;
+int fuzzy_digest(const struct fuzzy_state *self, /*@out@*/ char *result) {
+       unsigned int bi = self->bhstart;
        uint32_t h = roll_sum(&self->roll);
        int i, remain = FUZZY_MAX_RESULT - 1;
-       const struct blockhash_context *bh = self->blockhashes, *bhp;
        /* Verify that our elimination was not overeager. */
-       assert(bs == MIN_BLOCKSIZE ||
-                       (size_t)bs / 2 * SPAMSUM_LENGTH < self->total_size);
+       assert(bi == 0 || (size_t)SSDEEP_BS(bi) / 2 * SPAMSUM_LENGTH <
+                       self->total_size);
 
        /* Initial blocksize guess. */
-       while((size_t)bs * SPAMSUM_LENGTH < self->total_size) {
-               bs *= 2;
-               assert(bh->next != NULL);
-               bh = bh->next;
+       while((size_t)SSDEEP_BS(bi) * SPAMSUM_LENGTH < self->total_size) {
+               ++bi;
+               if(bi >= NUM_BLOCKHASHES) {
+                       /* The input exceeds data types. */
+                       errno = EOVERFLOW;
+                       return -1;
+               }
        }
        /* Adapt blocksize guess to actual digest length. */
-       while(bs > self->start_blocksize && bh->dlen < SPAMSUM_LENGTH / 2) {
-               bs /= 2;
-               /* Slow version of bh = bh->prev; */
-               for(bhp = self->blockhashes; bhp->next != bh; bhp = bhp->next)
-                       ;
-               bh = bhp;
-       }
-       assert(bh != NULL);
-       assert(!(bs > MIN_BLOCKSIZE && bh->dlen < SPAMSUM_LENGTH / 2));
-
-       i = snprintf(result, (size_t)remain, "%u:", bs);
-       /* In theory snprintf can fail. It is unclear how though, so we assume
-        * that it doesn't for simplicity. */
-       assert(i > 0);
+       while(bi >= self->bhend)
+               --bi;
+       while(bi > self->bhstart && self->bh[bi].dlen < SPAMSUM_LENGTH / 2)
+               --bi;
+       assert(!(bi > 0 && self->bh[bi].dlen < SPAMSUM_LENGTH / 2));
+
+       i = snprintf(result, (size_t)remain, "%u:", SSDEEP_BS(bi));
+       if(i <= 0)
+               /* Maybe snprintf has set errno here? */
+               return -1;
        assert(i < remain);
        remain -= i;
        result += i;
-       i = (int)bh->dlen;
+       i = (int)self->bh[bi].dlen;
        if(i > remain)
                i = remain;
-       memcpy(result, bh->digest, (size_t)i);
+       memcpy(result, self->bh[bi].digest, (size_t)i);
        result += i;
        remain -= i;
        if(remain > 0 && h != 0) {
-               *result++ = b64[bh->h % 64];
+               *result++ = b64[self->bh[bi].h % 64];
                --remain;
        }
        if(remain > 0) {
                *result++ = ':';
                --remain;
        }
-       if(NULL != bh->next) {
-               bh = bh->next;
-               i = (int)bh->dlen;
+       if(bi < self->bhend - 1) {
+               ++bi;
+               i = (int)self->bh[bi].dlen;
                if(i > SPAMSUM_LENGTH / 2 - 1)
                        i = SPAMSUM_LENGTH / 2 - 1;
                if(i > remain)
                        i = remain;
-               memcpy(result, bh->digest, (size_t)i);
+               memcpy(result, self->bh[bi].digest, (size_t)i);
                result += i;
                remain -= i;
                if(remain > 0 && h != 0) {
-                       *result++ = b64[bh->halfh % 64];
+                       *result++ = b64[self->bh[bi].halfh % 64];
                        --remain;
                }
        } else if(remain > 0 && h != 0) {
-               assert(bh->dlen == 0);
-               *result++ = b64[bh->h % 64];
+               assert(self->bh[bi].dlen == 0);
+               *result++ = b64[self->bh[bi].h % 64];
                --remain;
        }
        *result = '\0';
+       return 0;
 }
 
-static void ssdeep_clear(struct ssdeep_context *self) {
-       struct blockhash_context *bh, *bhn;
-       bh = self->blockhashes;
-       while(bh) {
-               bhn = bh->next;
-               free(bh);
-               bh = bhn;
-       }
+void fuzzy_free(/*@only@*/ struct fuzzy_state *self) {
+       free(self);
 }
 
 int fuzzy_hash_buf(const unsigned char *buf, uint32_t buf_len,
                /*@out@*/ char *result) {
-       struct ssdeep_context ctx[1];
-       if(ssdeep_init(ctx) < 0)
-               return -1;
-       if(ssdeep_engine(ctx, buf, buf_len) < 0) {
-               ssdeep_clear(ctx);
+       struct fuzzy_state *ctx;
+       int ret = -1;
+       if(NULL == (ctx = fuzzy_new()))
                return -1;
-       }
-       ssdeep_digest(ctx, result);
-       ssdeep_clear(ctx);
-       return 0;
+       if(fuzzy_update(ctx, buf, buf_len) < 0)
+               goto out;
+       if(fuzzy_digest(ctx, result) < 0)
+               goto out;
+       ret = 0;
+out:
+       fuzzy_free(ctx);
+       return ret;
 }
 
 int fuzzy_hash_stream(FILE *handle, /*@out@*/ char *result) {
-       struct ssdeep_context ctx[1];
+       struct fuzzy_state *ctx;
        unsigned char buffer[4096];
        size_t n;
-       if(ssdeep_init(ctx) < 0)
+       int ret = -1;
+       if(NULL == (ctx = fuzzy_new()))
                return -1;
        for(;;) {
                n = fread(buffer, 1, 4096, handle);
                if(0 == n)
                        break;
-               if(ssdeep_engine(ctx, buffer, n) < 0)
-                       goto errout;
+               if(fuzzy_update(ctx, buffer, n) < 0)
+                       goto out;
        }
        if(ferror(handle) != 0)
-               goto errout;
-       ssdeep_digest(ctx, result);
-       ssdeep_clear(ctx);
-       return 0;
-errout:
-       ssdeep_clear(ctx);
-       return -1;
+               goto out;
+       if(fuzzy_digest(ctx, result) < 0)
+               goto out;
+       ret = 0;
+out:
+       fuzzy_free(ctx);
+       return ret;
 }
 
 #ifdef S_SPLINT_S
 typedef size_t off_t;
+int fseeko(FILE *, off_t, int);
+off_t ftello(FILE *);
 #endif
 
 int fuzzy_hash_file(FILE *handle, /*@out@*/ char *result) {
@@ -369,25 +355,3 @@ int fuzzy_hash_filename(const char *filename, /*@out@*/ char *result) {
        return status;
 }
 
-#ifdef S_SPLINT_S
-extern /*@only@*/ /*@null@*/ char *realpath(const char *,
-               /*@null@*/ const char *);
-#endif
-
-int main(int argc, char **argv) {
-       char digest[FUZZY_MAX_RESULT], *p;
-       int i;
-       puts("ssdeep,1.1--blocksize:hash:hash,filename");
-       for(i = 1; i < argc; ++i) {
-               p = realpath(argv[i], NULL);
-               if(p == NULL)
-                       return 1;
-               if(fuzzy_hash_filename(argv[i], digest) < 0) {
-                       free(p);
-                       return 1;
-               }
-               printf("%s,\"%s\"\n", digest, p);
-               free(p);
-       }
-       return 0;
-}