diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-03-08 16:33:09 +0100 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-03-08 16:33:09 +0100 |
commit | 0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4 (patch) | |
tree | ad0e188b62806ecc7d2d806f1f8958b0a1fd8bbf | |
parent | 5b5cf7f2629c3a6c78f6057ff1e8476ff001409f (diff) | |
download | debian-dedup-0e690a1f5e32d1e16ad27dd96cb43b78d5d36fb4.tar.gz |
generalize ImageHash to StoredHash
-rw-r--r-- | dedup/filters.py | 50 | ||||
-rw-r--r-- | dedup/hashing.py | 60 | ||||
-rw-r--r-- | dedup/image.py | 78 |
3 files changed, 141 insertions, 47 deletions
class PNGFilter(object):
    """Skips non-critical chunks in a PNG file.

    Data is fed incrementally through filter(), which returns the bytes of
    every chunk whose type starts with an uppercase letter (length, type,
    payload and crc). The 8 byte file magic is verified but not part of the
    output.
    """
    magic = b"\x89PNG\r\n\x1a\n"

    def __init__(self):
        # Bytes received but not yet consumed (at most a partial chunk
        # header once the magic has been seen).
        self.inbuffer = b""
        # Whether the chunk currently being consumed is kept.
        self.critchunk = False
        # Bytes left in the current chunk. None until the magic was checked.
        self.chunkleft = None

    def filter(self, data):
        """Consume more input and return the part of it that is kept.

        @type data: bytes
        @param data: the next piece of the PNG file
        @rtype: bytes
        @returns: the bytes of critical chunks completed or continued by
            this call, possibly empty
        @raises ValueError: if the input does not start with the PNG magic
        """
        self.inbuffer += data
        if self.chunkleft is None:
            if len(self.inbuffer) < len(self.magic):
                return b""
            if not self.inbuffer.startswith(self.magic):
                raise ValueError("PNG file magic not found")
            self.inbuffer = self.inbuffer[len(self.magic):]
            self.chunkleft = 0
        kept = []
        while True:
            if self.chunkleft == 0:
                if len(self.inbuffer) < 8:
                    break  # need a full length + type header
                length, chunktype = struct.unpack_from(">I4s", self.inbuffer)
                self.chunkleft = length + 12  # len, type, crc
                # Slice instead of index: indexing bytes yields an int on
                # Python 3, which has no isupper method.
                self.critchunk = chunktype[:1].isupper()
            n = min(self.chunkleft, len(self.inbuffer))
            if self.critchunk:
                kept.append(self.inbuffer[:n])
            self.inbuffer = self.inbuffer[n:]
            self.chunkleft -= n
            if self.chunkleft:
                break  # chunk continues in the next call
        return b"".join(kept)

    def flush(self):
        """Return whatever is still buffered and empty the buffer.

        @rtype: bytes
        """
        ret = self.inbuffer
        self.inbuffer = b""
        return ret

    def copy(self):
        """Return an independent filter in the same state as this one."""
        new = PNGFilter()
        new.inbuffer = self.inbuffer
        new.critchunk = self.critchunk
        new.chunkleft = self.chunkleft
        return new
class StoredHash(object):
    """A hash that stores all the updates and then does all the work on the
    hexdigest call."""

    def __init__(self, digestfunc, acceptfunc=lambda _: True,
                 sizelimit=1024*1024*16):
        """
        @type digestfunc: file-like -> str
        @param digestfunc: should read the given file-like and return
            the computed hash. The file-like can be assumed to be seekable.
        @type acceptfunc: file-like -> bool or None
        @param acceptfunc: should read enough of file-like to determine
            whether the hash is computable. To accept the hash, return True.
            To reject producing a hash return False. To defer the decision
            until more data is available return None.
        @type sizelimit: int
        @param sizelimit: when the content exceeds this size, reject it
        """
        self.digestfunc = digestfunc
        self.acceptfunc = acceptfunc
        self.sizelimit = sizelimit
        # Buffer of everything seen so far; None once the content has been
        # rejected (by acceptfunc or by exceeding sizelimit).
        self.content = io.BytesIO()
        # Becomes True when acceptfunc returned a true value.
        self.accepted = False

    def update(self, data):
        """Append data to the stored content.

        Empty data and updates after a rejection are ignored. As long as
        the content has not been accepted, acceptfunc is consulted after
        each update with the buffer rewound to its start.
        """
        if self.content is None or not data:
            return
        self.content.seek(0, os.SEEK_END)
        if self.content.tell() + len(data) > self.sizelimit:
            self.content = None
            return
        self.content.write(data)
        if self.accepted:
            return
        self.content.seek(0, os.SEEK_SET)
        verdict = self.acceptfunc(self.content)
        if verdict is None:
            return  # undecided, ask again on the next update
        if verdict:
            self.accepted = True
        else:
            self.content = None

    def hexdigest(self):
        """Return digestfunc applied to the stored content.

        @returns: the value computed by digestfunc or None if the content
            was rejected or not (yet) accepted
        """
        if self.content is None or not self.accepted:
            return None
        self.content.seek(0, os.SEEK_SET)
        return self.digestfunc(self.content)

    def copy(self):
        """Return a StoredHash with the same functions, limit and state
        that owns a separate copy of the stored content."""
        new = StoredHash(self.digestfunc, self.acceptfunc, self.sizelimit)
        if self.content is None:
            new.content = None
        else:
            new.content = io.BytesIO(self.content.getvalue())
        new.accepted = self.accepted
        return new
def detectpng(content, maxpixels=1024 * 1024 * 32):
    """Decide whether content is the beginning of an acceptable PNG file.

    @type content: bytes
    @param content: the first bytes of the file. 33 bytes (signature plus
        the IHDR chunk) are required for a positive decision.
    @type maxpixels: int
    @param maxpixels: reject images whose width times height, as declared
        in the IHDR chunk, exceeds this value
    @rtype: bool or None
    @returns: True to accept, False to reject and None to defer the
        decision until more data is available
    """
    header = b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"
    # Compare only the part that is available so that data which cannot
    # become a PNG is rejected right away instead of being deferred.
    if content[:len(header)] != header[:len(content)]:
        return False
    if len(content) < 33:
        return None  # defer decision
    width, height = struct.unpack(">II", content[16:24])
    return width * height <= maxpixels
def computehash(self, bytesio):
    """Hash the decoded pixels of the image stored in bytesio.

    Method of ImageHash, used as the digestfunc of the StoredHash base.
    Every pixel is fed to a copy of self.hashobj as four bytes (RGBA).

    @type bytesio: file-like
    @param bytesio: the complete image file, positioned at its start
    @rtype: str
    @returns: the hexdigest of the pixel data followed by the width and
        the height of the image as 8 hex digits each
    """
    hashobj = self.hashobj.copy()
    # Decode the stream that was handed in rather than self.content:
    # StoredHash.copy passes this bound method on to the copy, so
    # self.content may belong to a different object than bytesio.
    img = PIL.Image.open(bytesio)
    width, height = img.size
    # special casing easy modes reduces memory usage
    if img.mode == "L":
        pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
    elif img.mode == "RGB":
        pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
    else:
        if img.mode != "RGBA":
            img = img.convert("RGBA")
        pack = lambda elem: struct.pack("BBBB", *elem)
    for elem in img.getdata():
        hashobj.update(pack(elem))
    return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)