class ImageHash(object):
    """A hash on the contents of an image. This disregards mode, depth and meta
    information. Note that due to limitations in PIL and the image format
    (interlacing) the full contents are stored and decoded in hexdigest.

    Only data starting with a PNG signature and IHDR chunk is accepted; every
    other input is rejected with ValueError.
    """
    # upper bound on the number of encoded bytes buffered by update
    maxsize = 1024 * 1024 * 32
    # max memory usage is about 5 * maxpixels in bytes
    maxpixels = 1024 * 1024 * 32

    def __init__(self, hashobj):
        """
        @param hashobj: a hashlib-like object
        """
        self.hashobj = hashobj
        self.imagedetected = False
        self.content = io.BytesIO()

    def update(self, data):
        """Buffer another chunk of encoded image data. As soon as 33 bytes
        (PNG signature plus IHDR chunk) are available, the header is checked.

        @param data: next chunk of the file as a byte string
        @raises ValueError: if more than maxsize bytes were buffered, if the
            data does not start with a PNG signature followed by IHDR, or if
            width * height from IHDR exceeds maxpixels
        """
        self.content.write(data)
        if self.content.tell() > self.maxsize:
            raise ValueError("maximum image size exceeded")
        if self.imagedetected:
            return
        if self.content.tell() < 33:  # header + IHDR
            return
        curvalue = self.content.getvalue()
        if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
            # IHDR payload starts at offset 16 with big-endian width, height
            width, height = struct.unpack(">II", curvalue[16:24])
            if width * height > self.maxpixels:
                raise ValueError("maximum image pixels exceeded")
            self.imagedetected = True
            return
        raise ValueError("not a png image")

    def copy(self):
        """Return an independent ImageHash with the same state.

        @returns: a new ImageHash holding a copy of the wrapped hash object,
            the detection flag and the buffered data
        """
        # The constructor requires a hash object; calling it without one
        # raised TypeError.
        new = ImageHash(self.hashobj.copy())
        new.imagedetected = self.imagedetected
        # Write into the fresh buffer instead of io.BytesIO(initial_bytes):
        # the latter starts at position 0, so a later update() on the copy
        # would overwrite the header and tell() would under-report the size.
        new.content.write(self.content.getvalue())
        return new

    def hexdigest(self):
        """Decode the buffered image and hash its pixels as RGBA bytes.

        The wrapped hash object is copied first, so it is left untouched and
        update may be called again afterwards.

        @returns: the hex digest of the pixel data followed by width and
            height, each as 8 hex digits
        @raises ValueError: if no PNG header has been detected yet
        """
        if not self.imagedetected:
            raise ValueError("not a png image")
        hashobj = self.hashobj.copy()
        pos = self.content.tell()
        try:
            self.content.seek(0)
            img = PIL.Image.open(self.content)
            width, height = img.size
            pack = lambda elem: struct.pack("BBBB", *elem)
            # special casing easy modes reduces memory usage
            if img.mode == "L":
                pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
            elif img.mode == "RGB":
                pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
            elif img.mode != "RGBA":
                img = img.convert("RGBA")
            for elem in img.getdata():
                hashobj.update(pack(elem))
        finally:
            # restore the write position so that update keeps appending
            self.content.seek(pos)
        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
def imagehash():
    """Build the hasher used for image contents.

    A sha512-backed ImageHash is wrapped in a SuppressingHash configured with
    (ValueError,) and named "image_sha512".
    """
    # NOTE(review): presumably the wrapper keeps ValueError (raised by
    # ImageHash for non-PNG or oversized input) from aborting the import --
    # confirm against dedup.hashing.SuppressingHash.
    suppressed = SuppressingHash(ImageHash(hashlib.sha512()), (ValueError,))
    suppressed.name = "image_sha512"
    return suppressed