dedup/image.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111

import io
import struct

import PIL.Image

from .hashing import HashlibLike

class ImageHash:
    """A hash on the contents of an image data type supported by PIL. This
    disregards mode, depth and meta information. Note that due to limitations
    in PIL and the image format (interlacing) the full contents are stored and
    decoded in hexdigest.

    Subclasses set name_prefix and implement detect()."""
    # upper bound on the number of encoded bytes accepted by update()
    maxsize = 1024 * 1024 * 32
    # max memory usage is about 5 * maxpixels in bytes
    maxpixels = 1024 * 1024 * 32
    name_prefix: str

    def __init__(self, hashobj: "HashlibLike") -> None:
        """
        @param hashobj: a hashlib-like object
        """
        self.hashobj = hashobj
        self.imagedetected = False
        self.content = io.BytesIO()

    def detect(self) -> bool:
        """Inspect the bytes buffered so far and decide whether they start an
        image of the expected type. Must be implemented by subclasses.

        @returns: True once the format has been recognized, False while more
            data is needed
        """
        raise NotImplementedError

    def update(self, data: bytes) -> None:
        """Append data to the buffered image contents.

        @param data: next chunk of the encoded image
        @raises ValueError: if more than maxsize bytes have been buffered
        """
        self.content.write(data)
        if self.content.tell() > self.maxsize:
            raise ValueError("maximum image size exceeded")
        # Keep probing until detect() reports success; afterwards it is
        # never called again.
        if not self.imagedetected:
            self.imagedetected = self.detect()

    def copy(self) -> "ImageHash":
        """Create an independent clone with the same buffered contents,
        detection state and a copy of the wrapped hash object."""
        new = self.__class__(self.hashobj.copy())
        new.imagedetected = self.imagedetected
        new.content = io.BytesIO(self.content.getvalue())
        # A BytesIO created from initial bytes starts at position 0. Move to
        # the position of the original so that further update() calls on the
        # clone append instead of overwriting the buffered data.
        new.content.seek(self.content.tell())
        return new

    def digest(self) -> bytes:
        """Not supported, always raises.

        @raises ValueError: always
        """
        raise ValueError("an ImageHash cannot produce a raw digest")

    def hexdigest(self) -> str:
        """Decode the buffered image and hash its pixels as RGBA quadruples.

        @returns: the hexdigest of the wrapped hash object over the pixel
            data followed by width and height as 8 hex digits each
        @raises ValueError: if no image was detected or if PIL fails to
            decode the buffered contents
        """
        if not self.imagedetected:
            raise ValueError("not a image")
        # Work on a copy so that the wrapped hash object stays untouched.
        hashobj = self.hashobj.copy()
        pos = self.content.tell()
        try:
            self.content.seek(0)
            try:
                img = PIL.Image.open(self.content)
            except IOError as err:
                raise ValueError("broken header") from err
            width, height = img.size
            # default: pixels are already 4-tuples (RGBA)
            pack = lambda elem: struct.pack("BBBB", *elem)
            # special casing easy modes reduces memory usage
            if img.mode == "L":
                # grey value is replicated to R, G and B; fully opaque
                pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
            elif img.mode == "RGB":
                # append a fully opaque alpha channel
                pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
            elif img.mode != "RGBA":
                try:
                    img = img.convert("RGBA")
                except (SyntaxError, IndexError, IOError):
                    # crazy stuff from PIL
                    raise ValueError("error reading image")
            try:
                for elem in img.getdata():
                    hashobj.update(pack(elem))
            except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
                raise ValueError("error reading image")
        finally:
            # restore the position so that later update() calls keep appending
            self.content.seek(pos)
        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)

    @property
    def name(self) -> str:
        """The name of the wrapped hash object prefixed with name_prefix."""
        return self.name_prefix + self.hashobj.name


class PNGHash(ImageHash):
    """Hashes the decoded contents of a PNG image."""
    name_prefix = "png_"

    def detect(self) -> bool:
        """Check whether the buffered bytes start a PNG image.

        @returns: False while fewer than 33 bytes are buffered, True once
            the PNG signature and the IHDR chunk header have been seen
        @raises ValueError: if the data does not start like a PNG image or
            if width * height exceeds maxpixels
        """
        # signature + complete IHDR chunk
        if self.content.tell() < 33:
            return False
        data = self.content.getvalue()
        if not data.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
            raise ValueError("not a png image")
        # width and height are the first two big endian fields of IHDR
        width, height = struct.unpack_from(">II", data, 16)
        if width * height > self.maxpixels:
            raise ValueError("maximum image pixels exceeded")
        return True

class GIFHash(ImageHash):
    """A hash on the contents of the first frame of a GIF image."""
    name_prefix = "gif_"

    def detect(self) -> bool:
        """Check whether the buffered bytes start a GIF image.

        @returns: False while fewer than 10 bytes are buffered, True once
            a GIF87a or GIF89a magic and the dimensions have been seen
        @raises ValueError: if the data does not start with a GIF magic or
            if width * height exceeds maxpixels
        """
        if self.content.tell() < 10: # magic + logical dimension
            return False
        curvalue = self.content.getvalue()
        if curvalue.startswith((b"GIF87a", b"GIF89a")):
            # two little endian 16 bit values directly follow the magic
            width, height = struct.unpack("<HH", curvalue[6:10])
            if width * height > self.maxpixels:
                raise ValueError("maximum image pixels exceeded")
            return True
        raise ValueError("not a gif image")