diff options
Diffstat (limited to 'dedup')
-rw-r--r-- | dedup/image.py | 67 |
1 files changed, 67 insertions, 0 deletions
class ImageHash(object):
    """A hash on the contents of an image. This disregards mode, depth and meta
    information. Note that due to limitations in PIL and the image format
    (interlacing) the full contents are stored and decoded in hexdigest.

    Only PNG input is accepted: update() raises ValueError as soon as the
    buffered data is long enough to tell that it does not start with a PNG
    signature followed by an IHDR chunk.
    """
    # Upper bound on the number of raw (encoded) bytes buffered by update().
    maxsize = 1024 * 1024 * 32
    # max memory usage is about 5 * maxpixels in bytes
    maxpixels = 1024 * 1024 * 32

    def __init__(self, hashobj):
        """
        @param hashobj: a hashlib-like object
        """
        self.hashobj = hashobj
        # Set once the PNG signature and IHDR chunk have been validated.
        self.imagedetected = False
        # Raw image bytes collected so far; decoding happens in hexdigest.
        self.content = io.BytesIO()

    def update(self, data):
        """Append raw image data to the internal buffer.

        @param data: the next chunk of the encoded image as bytes
        @raises ValueError: if more than maxsize bytes were supplied, if the
            data does not start with a PNG signature and IHDR chunk, or if
            the IHDR chunk announces more than maxpixels pixels
        """
        self.content.write(data)
        if self.content.tell() > self.maxsize:
            raise ValueError("maximum image size exceeded")
        if self.imagedetected:
            return
        if self.content.tell() < 33:  # header + IHDR
            return
        curvalue = self.content.getvalue()
        if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
            # Width and height are the first two big-endian 32 bit fields of
            # the IHDR payload, which starts at offset 16.
            width, height = struct.unpack(">II", curvalue[16:24])
            if width * height > self.maxpixels:
                raise ValueError("maximum image pixels exceeded")
            self.imagedetected = True
            return
        raise ValueError("not a png image")

    def copy(self):
        """Create an independent clone of this object.

        @returns: a new ImageHash with a copy of the hash object, the same
            detection state and a copy of the buffered data
        """
        # The constructor requires the hash object, so pass the copy along.
        new = ImageHash(self.hashobj.copy())
        new.imagedetected = self.imagedetected
        new.content = io.BytesIO(self.content.getvalue())
        # A BytesIO created from initial bytes starts at position 0. Move to
        # the position of the original so that further update() calls append
        # instead of overwriting the buffered data.
        new.content.seek(self.content.tell())
        return new

    def hexdigest(self):
        """Decode the buffered image and hash its pixels as RGBA values.

        The wrapped hash object is not modified; a copy of it is fed with
        four bytes (red, green, blue, alpha) per pixel.

        @returns: the hexdigest of the pixel data followed by the image
            width and height, each formatted as eight hexadecimal digits
        @raises ValueError: if no PNG header has been detected by update()
        """
        if not self.imagedetected:
            raise ValueError("not a png image")
        hashobj = self.hashobj.copy()
        pos = self.content.tell()
        try:
            self.content.seek(0)
            img = PIL.Image.open(self.content)
            width, height = img.size
            pack = lambda elem: struct.pack("BBBB", *elem)
            # special casing easy modes reduces memory usage
            if img.mode == "L":
                pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
            elif img.mode == "RGB":
                pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
            elif img.mode != "RGBA":
                img = img.convert("RGBA")
            for elem in img.getdata():
                hashobj.update(pack(elem))
        finally:
            # Restore the write position so that update() can continue.
            self.content.seek(pos)
        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)