summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- README 2
-rw-r--r-- dedup/image.py 67
-rwxr-xr-x importpkg.py 9
3 files changed, 76 insertions, 2 deletions
diff --git a/README b/README
index 7bc4517..aff9868 100644
--- a/README
+++ b/README
@@ -1,7 +1,7 @@
Required packages
-----------------
-aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3
+aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging
Create a database
-----------------
diff --git a/dedup/image.py b/dedup/image.py
new file mode 100644
index 0000000..e05e7da
--- /dev/null
+++ b/dedup/image.py
@@ -0,0 +1,67 @@
+import io
+import struct
+
+import PIL.Image
+
+class ImageHash(object):
+    """A hash on the contents of an image. This disregards mode, depth and meta
+    information. Note that due to limitations in PIL and the image format
+    (interlacing) the full contents are stored and decoded in hexdigest.
+
+    Only PNG input is accepted: once enough bytes are buffered to inspect the
+    header, update() raises ValueError for anything that is not a PNG or that
+    exceeds one of the limits below.
+    """
+    # upper bound on the number of input bytes buffered by update()
+    maxsize = 1024 * 1024 * 32
+    # upper bound on width * height as declared in the IHDR chunk;
+    # max memory usage is about 5 * maxpixels in bytes
+    maxpixels = 1024 * 1024 * 32
+
+    def __init__(self, hashobj):
+        """
+        @param hashobj: a hashlib-like object
+        """
+        self.hashobj = hashobj
+        # set once the PNG signature and the IHDR dimensions passed the checks
+        self.imagedetected = False
+        # raw image file bytes collected so far
+        self.content = io.BytesIO()
+
+    def update(self, data):
+        """Append data to the buffered image and validate the PNG header.
+
+        @param data: next chunk of the raw file as bytes
+        @raises ValueError: if more than maxsize bytes were supplied, if the
+            data does not start with a PNG signature followed by IHDR, or if
+            the declared width * height exceeds maxpixels
+        """
+        self.content.write(data)
+        if self.content.tell() > self.maxsize:
+            raise ValueError("maximum image size exceeded")
+        if self.imagedetected:
+            return
+        # 8 byte signature + 25 byte IHDR chunk (length, type, data, crc)
+        if self.content.tell() < 33:
+            return
+        curvalue = self.content.getvalue()
+        if not curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+            raise ValueError("not a png image")
+        # width and height are the first two big endian 32 bit IHDR fields
+        width, height = struct.unpack(">II", curvalue[16:24])
+        if width * height > self.maxpixels:
+            raise ValueError("maximum image pixels exceeded")
+        self.imagedetected = True
+
+    def copy(self):
+        """Return an independent ImageHash carrying the same state.
+
+        The wrapped hash object and the buffered bytes are duplicated, so
+        updating the copy does not affect self and vice versa.
+        """
+        # the constructor requires the hash object
+        new = ImageHash(self.hashobj.copy())
+        new.imagedetected = self.imagedetected
+        # Write into the fresh buffer rather than passing initial bytes to
+        # BytesIO: the latter leaves the position at 0, so the next update()
+        # would overwrite the copied data and tell() would misreport the size.
+        new.content.write(self.content.getvalue())
+        return new
+
+    def hexdigest(self):
+        """Decode the buffered image and hash its pixels.
+
+        Every pixel is fed to a copy of hashobj as four bytes: grey values
+        (mode "L") are repeated three times, modes without alpha get 255
+        appended and all other modes are converted to RGBA first. self.hashobj
+        itself is never updated and the buffer position is restored.
+
+        @return: the hex digest of the pixel hash followed by width and
+            height, each formatted as 8 hex digits
+        @raises ValueError: if no PNG header has been detected by update()
+        """
+        if not self.imagedetected:
+            raise ValueError("not a png image")
+        hashobj = self.hashobj.copy()
+        pos = self.content.tell()
+        # precompiled format: pack4 is called once per pixel
+        pack4 = struct.Struct("BBBB").pack
+        try:
+            self.content.seek(0)
+            # NOTE(review): errors raised by PIL while opening or decoding
+            # propagate unchanged - confirm the caller copes with non-ValueError
+            img = PIL.Image.open(self.content)
+            width, height = img.size
+            # special casing easy modes reduces memory usage
+            if img.mode == "L":
+                pack = lambda elem: pack4(elem, elem, elem, 255)
+            elif img.mode == "RGB":
+                pack = lambda elem: pack4(*(elem + (255,)))
+            else:
+                if img.mode != "RGBA":
+                    img = img.convert("RGBA")
+                pack = lambda elem: pack4(*elem)
+            for elem in img.getdata():
+                hashobj.update(pack(elem))
+        finally:
+            # leave the buffer ready for further update() calls
+            self.content.seek(pos)
+        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
diff --git a/importpkg.py b/importpkg.py
index 5901b57..d626fba 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -20,6 +20,7 @@ import lzma
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
+from dedup.image import ImageHash
class ArReader(object):
global_magic = b"!<arch>\n"
@@ -96,11 +97,17 @@ def gziphash():
hashobj.name = "gzip_sha512"
return HashBlacklist(hashobj, boring_sha512_hashes)
+def imagehash():
+    """Build the hasher used for image contents.
+
+    A SHA-512 backed ImageHash is wrapped in a SuppressingHash that is handed
+    (ValueError,) and the wrapper is labelled "image_sha512".
+    """
+    # presumably SuppressingHash swallows the listed exception types raised
+    # by the wrapped hash - verify against dedup.hashing
+    suppressed = SuppressingHash(ImageHash(hashlib.sha512()), (ValueError,))
+    suppressed.name = "image_sha512"
+    return suppressed
+
def get_hashes(tar):
for elem in tar:
if not elem.isreg(): # excludes hard links as well
continue
- hasher = MultiHash(sha512_nontrivial(), gziphash())
+ hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
hasher = hash_file(hasher, tar.extractfile(elem))
for hashobj in hasher.hashes:
hashvalue = hashobj.hexdigest()