summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Helmut Grohne <helmut@subdivi.de> 2013-03-08 16:33:09 +0100
committer Helmut Grohne <helmut@subdivi.de> 2013-03-08 16:33:09 +0100
commit 0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4 (patch)
tree ad0e188b62806ecc7d2d806f1f8958b0a1fd8bbf
parent 5b5cf7f2629c3a6c78f6057ff1e8476ff001409f (diff)
download debian-dedup-0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4.tar.gz
generalize ImageHash to StoredHash
-rw-r--r-- dedup/filters.py 50
-rw-r--r-- dedup/hashing.py 60
-rw-r--r-- dedup/image.py 78
3 files changed, 141 insertions, 47 deletions
diff --git a/dedup/filters.py b/dedup/filters.py
new file mode 100644
index 0000000..c5b3251
--- /dev/null
+++ b/dedup/filters.py
@@ -0,0 +1,50 @@
+import struct
+
+class PNGFilter(object):
+    """Skips non-critical chunks in a PNG file.
+
+    Data is fed incrementally through filter(). The output is the
+    concatenation of every critical chunk (length, type, payload and crc).
+    The 8 byte file magic is validated and then dropped from the output.
+    """
+    magic = b"\x89PNG\r\n\x1a\n"
+
+    def __init__(self):
+        # Bytes received but not yet consumed.
+        self.inbuffer = b""
+        # Whether the chunk currently being consumed is a critical one.
+        self.critchunk = False
+        # Bytes left in the current chunk; None until the magic was checked.
+        self.chunkleft = None
+
+    def filter(self, data):
+        """Feed more input and return the filtered output produced so far.
+
+        @type data: bytes
+        @param data: the next piece of the PNG file
+        @rtype: bytes
+        @returns: the bytes of all critical chunks that could be consumed
+            with the data buffered so far (possibly empty)
+        @raises ValueError: if the input does not start with the PNG magic
+        """
+        self.inbuffer += data
+        if self.chunkleft is None:
+            if len(self.inbuffer) < 8:
+                return b""
+            if not self.inbuffer.startswith(self.magic):
+                raise ValueError("PNG file magic not found")
+            self.inbuffer = self.inbuffer[8:]
+            self.chunkleft = 0
+        # Collect pieces and join once instead of growing a bytes object.
+        kept = []
+        while True:
+            if self.chunkleft == 0:
+                # At a chunk boundary: need the 8 byte length + type header.
+                if len(self.inbuffer) < 8:
+                    break
+                length, chunktype = struct.unpack(">I4s", self.inbuffer[:8])
+                self.chunkleft = length + 12  # len, type, crc
+                # An uppercase first letter marks a critical chunk. Slice
+                # instead of indexing: indexing bytes yields an int on
+                # Python 3, which has no isupper().
+                self.critchunk = chunktype[:1].isupper()
+            n = min(self.chunkleft, len(self.inbuffer))
+            if self.critchunk:
+                kept.append(self.inbuffer[:n])
+            self.inbuffer = self.inbuffer[n:]
+            self.chunkleft -= n
+            if self.chunkleft:
+                # Current chunk is incomplete; wait for more data.
+                break
+        return b"".join(kept)
+
+    def flush(self):
+        """Return whatever is still buffered and empty the buffer.
+
+        @rtype: bytes
+        """
+        ret = self.inbuffer
+        self.inbuffer = b""
+        return ret
+
+    def copy(self):
+        """Return an independent PNGFilter in the same parsing state."""
+        new = PNGFilter()
+        new.inbuffer = self.inbuffer
+        new.critchunk = self.critchunk
+        new.chunkleft = self.chunkleft
+        return new
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 1283c7e..0c786e1 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,6 @@
+import io
+import os
+
class HashBlacklist(object):
"""Turn a hashlib-like object into a hash that returns None for some
blacklisted hashes instead of the real hash value.
@@ -96,6 +99,63 @@ class SuppressingHash(object):
return SuppressingHash(self.hashobj.copy(), self.exceptions)
return SuppressingHash(None, self.exceptions)
+class StoredHash(object):
+    """A hashlib-like object that only buffers the data passed to update
+    and postpones the whole computation to the hexdigest call."""
+
+    def __init__(self, digestfunc, acceptfunc=lambda _: True,
+                 sizelimit=1024*1024*16):
+        """
+        @type digestfunc: file-like -> str
+        @param digestfunc: reads the given file-like and returns the
+            computed hash. The file-like is positioned at its start and
+            can be assumed to be seekable.
+        @type acceptfunc: file-like -> bool or None
+        @param acceptfunc: reads as much of the file-like as it needs to
+            decide whether a hash can be computed. Returning True accepts
+            the content, returning False rejects it and returning None
+            defers the decision until more data has arrived.
+        @type sizelimit: int
+        @param sizelimit: content growing beyond this many bytes is
+            rejected
+        """
+        self.digestfunc = digestfunc
+        self.acceptfunc = acceptfunc
+        self.sizelimit = sizelimit
+        # Buffered content; set to None once the content was rejected.
+        self.content = io.BytesIO()
+        # Becomes True as soon as acceptfunc returned a true value.
+        self.accepted = False
+
+    def update(self, data):
+        """Buffer data and, while undecided, ask acceptfunc for a verdict."""
+        if not data or self.content is None:
+            return
+        buf = self.content
+        stored = buf.seek(0, os.SEEK_END)
+        if stored + len(data) > self.sizelimit:
+            self.content = None
+            return
+        buf.write(data)
+        if self.accepted:
+            return
+        buf.seek(0, os.SEEK_SET)
+        verdict = self.acceptfunc(buf)
+        if verdict is None:
+            # Not enough data yet; ask again on the next update.
+            return
+        if verdict:
+            self.accepted = True
+        else:
+            self.content = None
+
+    def hexdigest(self):
+        """Return digestfunc applied to the buffered content, or None if
+        the content was rejected or never accepted."""
+        if self.accepted and self.content:
+            self.content.seek(0, os.SEEK_SET)
+            return self.digestfunc(self.content)
+        return None
+
+    def copy(self):
+        """Return a StoredHash with its own copy of the buffered content."""
+        clone = StoredHash(self.digestfunc, self.acceptfunc, self.sizelimit)
+        clone.content = (io.BytesIO(self.content.getvalue())
+                         if self.content else None)
+        clone.accepted = self.accepted
+        return clone
+
def hash_file(hashobj, filelike, blocksize=65536):
"""Feed the entire contents from the given filelike to the given hashobj.
@param hashobj: hashlib-like object providing an update method
diff --git a/dedup/image.py b/dedup/image.py
index e05e7da..78f8bd9 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -1,9 +1,20 @@
-import io
import struct
import PIL.Image
-class ImageHash(object):
+from . import hashing
+
+def detectpng(content, maxpixels=1024 * 1024 * 32):
+    """Decide from the leading bytes whether content is an acceptable PNG.
+
+    @type content: bytes
+    @param content: the beginning of the file
+    @type maxpixels: int
+    @param maxpixels: largest accepted value of width * height
+    @rtype: bool or None
+    @returns: None while fewer than 33 bytes (signature plus IHDR chunk)
+        are available, False if the bytes do not start with a PNG
+        signature followed by IHDR or the image has too many pixels, True
+        otherwise
+    """
+    if len(content) < 33:
+        return None  # defer decision
+    if content.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+        # Width and height are the first two big endian words of IHDR.
+        width, height = struct.unpack(">II", content[16:24])
+        return width * height <= maxpixels
+    return False
+
+class ImageHash(hashing.StoredHash):
"""A hash on the contents of an image. This disregards mode, depth and meta
information. Note that due to limitations in PIL and the image format
(interlacing) the full contents are stored and decoded in hexdigest."""
@@ -16,52 +27,25 @@ class ImageHash(object):
@param hashobj: a hashlib-like object
"""
self.hashobj = hashobj
- self.imagedetected = False
- self.content = io.BytesIO()
-
- def update(self, data):
- self.content.write(data)
- if self.content.tell() > self.maxsize:
- raise ValueError("maximum image size exceeded")
- if self.imagedetected:
- return
- if self.content.tell() < 33: # header + IHDR
- return
- curvalue = self.content.getvalue()
- if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
- width, height = struct.unpack(">II", curvalue[16:24])
- if width * height > self.maxpixels:
- raise ValueError("maximum image pixels exceeded")
- self.imagedetected = True
- return
- raise ValueError("not a png image")
+ hashing.StoredHash.__init__(self, self.computehash, self.detect,
+ self.maxsize)
+ self.hashobj = hashobj
- def copy(self):
- new = ImageHash()
- new.hashobj = self.hashobj.copy()
- new.imagedetected = self.imagedetected
- new.content = io.BytesIO(self.content.getvalue())
- return new
+    def detect(self, bytesio):
+        """Read up to 33 bytes from bytesio and pass them, together with
+        self.maxpixels, to detectpng; its result is returned unchanged."""
+        header = bytesio.read(33)
+        return detectpng(header, self.maxpixels)
- def hexdigest(self):
- if not self.imagedetected:
- raise ValueError("not a png image")
+    def computehash(self, bytesio):
+        """Decode the image in bytesio and hash its pixels.
+
+        Every pixel is fed to a copy of self.hashobj as four bytes
+        (R, G, B, A); mode "L" and "RGB" images get an alpha of 255.
+
+        @param bytesio: seekable file-like holding the image data
+        @rtype: str
+        @returns: the hexdigest followed by width and height, each as 8
+            hex digits
+        """
         hashobj = self.hashobj.copy()
-        pos = self.content.tell()
-        try:
-            self.content.seek(0)
-            img = PIL.Image.open(self.content)
-            width, height = img.size
-            pack = lambda elem: struct.pack("BBBB", *elem)
-            # special casing easy modes reduces memory usage
-            if img.mode == "L":
-                pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
-            elif img.mode == "RGB":
-                pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
-            elif img.mode != "RGBA":
-                img = img.convert("RGBA")
-            for elem in img.getdata():
-                hashobj.update(pack(elem))
-        finally:
-            self.content.seek(pos)
+        # Decode the stream that was handed in rather than reaching for
+        # self.content, so this honours the digestfunc(file-like) contract.
+        img = PIL.Image.open(bytesio)
+        width, height = img.size
+        pack = lambda elem: struct.pack("BBBB", *elem)
+        # special casing easy modes reduces memory usage
+        if img.mode == "L":
+            pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
+        elif img.mode == "RGB":
+            pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
+        elif img.mode != "RGBA":
+            img = img.convert("RGBA")
+        for elem in img.getdata():
+            hashobj.update(pack(elem))
         return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)