From 0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 8 Mar 2013 16:33:09 +0100
Subject: generalize ImageHash to StoredHash

---
 dedup/image.py | 78 +++++++++++++++++++++++-----------------------------------
 1 file changed, 31 insertions(+), 47 deletions(-)

(limited to 'dedup/image.py')

diff --git a/dedup/image.py b/dedup/image.py
index e05e7da..78f8bd9 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -1,9 +1,20 @@
-import io
 import struct
 
 import PIL.Image
 
-class ImageHash(object):
+from . import hashing
+
+def detectpng(content, maxpixels=1024 * 1024 * 32):
+    if len(content) < 33:
+        return None # defer decision
+    if not content.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+        return False
+    width, height = struct.unpack(">II", content[16:24])
+    if width * height > maxpixels:
+        return False
+    return True
+
+class ImageHash(hashing.StoredHash):
     """A hash on the contents of an image. This disregards mode, depth and meta
     information. Note that due to limitations in PIL and the image format
     (interlacing) the full contents are stored and decoded in hexdigest."""
@@ -16,52 +27,25 @@ class ImageHash(object):
         @param hashobj: a hashlib-like object
         """
         self.hashobj = hashobj
-        self.imagedetected = False
-        self.content = io.BytesIO()
-
-    def update(self, data):
-        self.content.write(data)
-        if self.content.tell() > self.maxsize:
-            raise ValueError("maximum image size exceeded")
-        if self.imagedetected:
-            return
-        if self.content.tell() < 33: # header + IHDR
-            return
-        curvalue = self.content.getvalue()
-        if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
-            width, height = struct.unpack(">II", curvalue[16:24])
-            if width * height > self.maxpixels:
-                raise ValueError("maximum image pixels exceeded")
-            self.imagedetected = True
-            return
-        raise ValueError("not a png image")
+        hashing.StoredHash.__init__(self, self.computehash, self.detect,
+                                    self.maxsize)
+        self.hashobj = hashobj
 
-    def copy(self):
-        new = ImageHash()
-        new.hashobj = self.hashobj.copy()
-        new.imagedetected = self.imagedetected
-        new.content = io.BytesIO(self.content.getvalue())
-        return new
+    def detect(self, bytesio):
+        return detectpng(bytesio.read(33), self.maxpixels)
 
-    def hexdigest(self):
-        if not self.imagedetected:
-            raise ValueError("not a png image")
+    def computehash(self, bytesio):
         hashobj = self.hashobj.copy()
-        pos = self.content.tell()
-        try:
-            self.content.seek(0)
-            img = PIL.Image.open(self.content)
-            width, height = img.size
-            pack = lambda elem: struct.pack("BBBB", *elem)
-            # special casing easy modes reduces memory usage
-            if img.mode == "L":
-                pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
-            elif img.mode == "RGB":
-                pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
-            elif img.mode != "RGBA":
-                img = img.convert("RGBA")
-            for elem in img.getdata():
-                hashobj.update(pack(elem))
-        finally:
-            self.content.seek(pos)
+        img = PIL.Image.open(self.content)
+        width, height = img.size
+        pack = lambda elem: struct.pack("BBBB", *elem)
+        # special casing easy modes reduces memory usage
+        if img.mode == "L":
+            pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
+        elif img.mode == "RGB":
+            pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
+        elif img.mode != "RGBA":
+            img = img.convert("RGBA")
+        for elem in img.getdata():
+            hashobj.update(pack(elem))
         return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
-- 
cgit v1.2.3