# dedup/image.py

import struct

import PIL.Image

from . import hashing

def detectpng(content, maxpixels=1024 * 1024 * 32):
    """Decide from the leading bytes whether content is an acceptable PNG.

    @param content: the first bytes of the candidate file
    @param maxpixels: largest width * height product that is still accepted
    @returns: None if fewer than 33 bytes were supplied (decision deferred),
        False if the data does not start with the PNG signature followed by
        an IHDR chunk header or if the image has more than maxpixels pixels,
        True otherwise
    """
    # 8 byte PNG signature + 4 byte chunk length (13) + 4 byte chunk type
    prefix = b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"
    if len(content) < 33:
        # too little data to tell: defer decision
        return None
    if not content.startswith(prefix):
        return False
    # width and height follow the prefix as big-endian unsigned 32 bit values
    columns, rows = struct.unpack_from(">II", content, 16)
    return columns * rows <= maxpixels

class ImageHash(hashing.StoredHash):
    """A hash on the contents of an image. This disregards mode, depth and meta
    information. Note that due to limitations in PIL and the image format
    (interlacing) the full contents are stored and decoded in hexdigest."""
    # handed to hashing.StoredHash.__init__ as its third argument (presumably
    # the byte limit for the stored content -- confirm against StoredHash)
    maxsize = 1024 * 1024 * 32
    # max memory usage is about 5 * maxpixels in bytes
    maxpixels = 1024 * 1024 * 32

    def __init__(self, hashobj):
        """
        @param hashobj: a hashlib-like object
        """
        # Assigned once, before the bound callbacks are handed to the base
        # class, so they never refer to an instance lacking hashobj.
        self.hashobj = hashobj
        hashing.StoredHash.__init__(self, self.computehash, self.detect,
                                    self.maxsize)

    def detect(self, bytesio):
        """Classify the stream by its first 33 bytes.

        @param bytesio: a file-like object yielding bytes
        @returns: the result of detectpng for the bytes read, using
            self.maxpixels as the pixel limit
        """
        return detectpng(bytesio.read(33), self.maxpixels)

    def computehash(self, bytesio):
        """Hash the decoded pixels of the image contained in bytesio.

        Every pixel is fed to a copy of self.hashobj as four bytes (red,
        green, blue, alpha), so images that decode to the same pixels hash
        equally irrespective of their mode.

        @param bytesio: a file-like object that PIL can open as an image
        @returns: the hex digest followed by width and height, each formatted
            as 8 hexadecimal digits
        """
        hashobj = self.hashobj.copy()
        # Decode the stream handed to this callback rather than reaching for
        # an attribute that this class never defines.
        img = PIL.Image.open(bytesio)
        width, height = img.size
        # compile the format once instead of once per pixel
        pack4 = struct.Struct("BBBB").pack
        # special casing easy modes reduces memory usage
        if img.mode == "L":
            def pack(elem):
                # grey value replicated to all colour channels, fully opaque
                return pack4(elem, elem, elem, 255)
        elif img.mode == "RGB":
            def pack(elem):
                # append a fully opaque alpha channel
                return pack4(*(elem + (255,)))
        else:
            if img.mode != "RGBA":
                img = img.convert("RGBA")

            def pack(elem):
                return pack4(*elem)
        for elem in img.getdata():
            hashobj.update(pack(elem))
        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)