summaryrefslogtreecommitdiff
path: root/dedup/image.py
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2013-08-02 08:40:49 +0200
committerHelmut Grohne <helmut@subdivi.de>2013-08-02 08:40:49 +0200
commitcb3708825bf7ea32314040575cef35980dad0cd8 (patch)
tree31575a8525dc90ba6904268d94f47e1604bf0557 /dedup/image.py
parenta4bbbb6e664e605634cb3f9e0564c7e4a93697be (diff)
parent2712edb550968ce7ec8cd9800241d7944666631a (diff)
downloaddebian-dedup-cb3708825bf7ea32314040575cef35980dad0cd8.tar.gz
Merge branch master into sqlalchemy
This makes the sqlalchemy branch schema-compatible with master again. The biggest change on master was the introduction of the function table. It caused most of the conflicts. Note that webapp had one conflict not detected by git: The selecting of issues in show_package needed sqlalchemy conversion. Conflicts: README update_sharing.py webapp.py
Diffstat (limited to 'dedup/image.py')
-rw-r--r--dedup/image.py67
1 files changed, 45 insertions, 22 deletions
diff --git a/dedup/image.py b/dedup/image.py
index 1148890..c1f2de0 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,9 +4,10 @@ import struct
import PIL.Image
class ImageHash(object):
- """A hash on the contents of an image. This disregards mode, depth and meta
- information. Note that due to limitations in PIL and the image format
- (interlacing) the full contents are stored and decoded in hexdigest."""
+ """A hash on the contents of an image datat type supported by PIL. This
+ disregards mode, depth and meta information. Note that due to limitations
+ in PIL and the image format (interlacing) the full contents are stored and
+ decoded in hexdigest."""
maxsize = 1024 * 1024 * 32
# max memory usage is about 5 * maxpixels in bytes
maxpixels = 1024 * 1024 * 32
@@ -19,33 +20,25 @@ class ImageHash(object):
self.imagedetected = False
self.content = io.BytesIO()
+ def detect(self):
+ raise NotImplementedError
+
def update(self, data):
self.content.write(data)
if self.content.tell() > self.maxsize:
raise ValueError("maximum image size exceeded")
- if self.imagedetected:
- return
- if self.content.tell() < 33: # header + IHDR
- return
- curvalue = self.content.getvalue()
- if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
- width, height = struct.unpack(">II", curvalue[16:24])
- if width * height > self.maxpixels:
- raise ValueError("maximum image pixels exceeded")
- self.imagedetected = True
- return
- raise ValueError("not a png image")
+ if not self.imagedetected:
+ self.imagedetected = self.detect()
def copy(self):
- new = ImageHash()
- new.hashobj = self.hashobj.copy()
+ new = self.__class__(self.hashobj.copy())
new.imagedetected = self.imagedetected
new.content = io.BytesIO(self.content.getvalue())
return new
def hexdigest(self):
if not self.imagedetected:
- raise ValueError("not a png image")
+ raise ValueError("not a image")
hashobj = self.hashobj.copy()
pos = self.content.tell()
try:
@@ -53,7 +46,7 @@ class ImageHash(object):
try:
img = PIL.Image.open(self.content)
except IOError:
- raise ValueError("broken png header")
+ raise ValueError("broken header")
width, height = img.size
pack = lambda elem: struct.pack("BBBB", *elem)
# special casing easy modes reduces memory usage
@@ -64,13 +57,43 @@ class ImageHash(object):
elif img.mode != "RGBA":
try:
img = img.convert("RGBA")
- except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ except (SyntaxError, IndexError, IOError):
+ # crazy stuff from PIL
+ raise ValueError("error reading image")
try:
for elem in img.getdata():
hashobj.update(pack(elem))
except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ raise ValueError("error reading image")
finally:
self.content.seek(pos)
return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
+
+
+class PNGHash(ImageHash):
+ """A hash on the contents of a PNG image."""
+
+ def detect(self):
+ if self.content.tell() < 33: # header + IHDR
+ return False
+ curvalue = self.content.getvalue()
+ if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+ width, height = struct.unpack(">II", curvalue[16:24])
+ if width * height > self.maxpixels:
+ raise ValueError("maximum image pixels exceeded")
+ return True
+ raise ValueError("not a png image")
+
+class GIFHash(ImageHash):
+ """A hash on the contents of the first frame of a GIF image."""
+
+ def detect(self):
+ if self.content.tell() < 10: # magic + logical dimension
+ return False
+ curvalue = self.content.getvalue()
+ if curvalue.startswith((b"GIF87a", "GIF89a")):
+ width, height = struct.unpack("<HH", curvalue[6:10])
+ if width * height > self.maxpixels:
+ raise ValueError("maximum image pixels exceeded")
+ return True
+ raise ValueError("not a png image")