dedup/image.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73

import io
import struct

import PIL.Image

class ImageHash(object):
    """A hash on the contents of an image. This disregards mode, depth and meta
    information. Note that due to limitations in PIL and the image format
    (interlacing) the full contents are stored and decoded in hexdigest.

    The object mimics the update/copy/hexdigest part of the hashlib interface:
    raw file data is fed in via update and buffered, and the pixel data is only
    decoded and hashed once hexdigest is called. Only PNG input is accepted.
    """
    # maximum number of raw (encoded) bytes that update will buffer
    maxsize = 1024 * 1024 * 32
    # max memory usage is about 5 * maxpixels in bytes
    maxpixels = 1024 * 1024 * 32

    def __init__(self, hashobj):
        """
        @param hashobj: a hashlib-like object
        """
        self.hashobj = hashobj
        # becomes True once a valid PNG signature and IHDR chunk were seen
        self.imagedetected = False
        # buffer holding all raw data passed to update so far
        self.content = io.BytesIO()

    def update(self, data):
        """Append raw file data to the internal buffer.

        As soon as at least 33 bytes (PNG signature plus IHDR chunk) are
        available, the header is validated once.

        @param data: a bytes object with the next part of the file
        @raises ValueError: if more than maxsize bytes were supplied, if the
            data does not start with a PNG signature and IHDR chunk or if the
            image dimensions exceed maxpixels
        """
        self.content.write(data)
        if self.content.tell() > self.maxsize:
            raise ValueError("maximum image size exceeded")
        if self.imagedetected:
            return
        if self.content.tell() < 33: # header + IHDR
            return
        curvalue = self.content.getvalue()
        if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
            # width and height are the first two big endian 32bit fields of
            # the IHDR chunk, which starts right after the 16 byte prefix
            width, height = struct.unpack(">II", curvalue[16:24])
            if width * height > self.maxpixels:
                raise ValueError("maximum image pixels exceeded")
            self.imagedetected = True
            return
        raise ValueError("not a png image")

    def copy(self):
        """Create an independent duplicate of this object.

        The underlying hash object and the buffered data are both copied, so
        updating the duplicate does not affect the original and vice versa.

        @returns: a new ImageHash with the same state
        """
        new = ImageHash(self.hashobj.copy())
        new.imagedetected = self.imagedetected
        # Write into the fresh buffer instead of passing initial bytes to
        # BytesIO: the latter leaves the position at 0, so later updates would
        # overwrite the copied data and break the tell() based size checks.
        new.content.write(self.content.getvalue())
        return new

    def hexdigest(self):
        """Decode the buffered image and hash its pixels as RGBA.

        The wrapped hash object is not modified; a copy of it is updated with
        four bytes (red, green, blue, alpha) per pixel. The position of the
        internal buffer is restored afterwards, so further update calls remain
        possible.

        @returns: the hex digest of the pixel data followed by the width and
            the height of the image as 8 hex digits each
        @raises ValueError: if no PNG header was detected yet or if PIL fails
            to open or decode the image
        """
        if not self.imagedetected:
            raise ValueError("not a png image")
        hashobj = self.hashobj.copy()
        pos = self.content.tell()
        try:
            self.content.seek(0)
            try:
                img = PIL.Image.open(self.content)
            except IOError:
                raise ValueError("broken png header")
            width, height = img.size
            # compile the struct format once instead of once per pixel
            pack4 = struct.Struct("BBBB").pack
            pack = lambda elem: pack4(*elem)
            # special casing easy modes reduces memory usage
            if img.mode == "L":
                pack = lambda elem: pack4(elem, elem, elem, 255)
            elif img.mode == "RGB":
                pack = lambda elem: pack4(*(elem + (255,)))
            elif img.mode != "RGBA":
                img = img.convert("RGBA")
            try:
                for elem in img.getdata():
                    hashobj.update(pack(elem))
            except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
                raise ValueError("error reading png image")
        finally:
            # restore the write position even when decoding failed
            self.content.seek(pos)
        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)