summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Helmut Grohne <helmut@subdivi.de> 2013-03-08 16:33:37 +0100
committer: Helmut Grohne <helmut@subdivi.de> 2013-03-08 16:33:37 +0100
commit: d017cd116e3666a8a0e22cd2c0b40af45514fa0c (patch)
tree: c5f2dd65ae5edc4413fa32a0d6b4084090036ecd
parent: 0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4 (diff)
download: debian-dedup-d017cd116e3666a8a0e22cd2c0b40af45514fa0c.tar.gz
importpkg: support ssdeep hash
-rw-r--r-- README 2
-rwxr-xr-x importpkg.py 23
2 files changed, 22 insertions, 3 deletions
diff --git a/README b/README
index bc1b715..4e2083c 100644
--- a/README
+++ b/README
@@ -3,6 +3,8 @@ Required packages
aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging
+ Optional: https://pypi.python.org/pypi/ssdeep
+
Create a database
-----------------
The database name is currently hardcoded as `test.sqlite3`. So copy the SQL
diff --git a/importpkg.py b/importpkg.py
index d63b85e..84cdad1 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -11,7 +11,12 @@ from debian.debian_support import version_compare
from debian import deb822
import lzma
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
+try:
+ import ssdeep
+except ImportError:
+ ssdeep = None
+
+from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, StoredHash, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import ImageHash
@@ -96,12 +101,24 @@ def imagehash():
hashobj.name = "image_sha512"
return hashobj
+if ssdeep is None:
+ def multihash():
+ return MultiHash(sha512_nontrivial(), gziphash(), imagehash())
+else:
+ def ssdeephash():
+ hashobj = StoredHash(lambda bytesio: ssdeep.hash(bytesio.getvalue()))
+ hashobj.name = "ssdeep"
+ return hashobj
+
+ def multihash():
+ return MultiHash(sha512_nontrivial(), gziphash(), imagehash(),
+ ssdeephash())
+
def get_hashes(tar):
for elem in tar:
if not elem.isreg(): # excludes hard links as well
continue
- hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
- hasher = hash_file(hasher, tar.extractfile(elem))
+ hasher = hash_file(multihash(), tar.extractfile(elem))
for hashobj in hasher.hashes:
hashvalue = hashobj.hexdigest()
if hashvalue: