From 2712edb550968ce7ec8cd9800241d7944666631a Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 1 Aug 2013 23:06:26 +0200 Subject: support hashing gif images * Rename "image_sha512" to "png_sha512". * dedup.image.ImageHash is now a base class for image hashes such as PNGHash and GIFHash. * Enable both hashes in importpkg. * Fix README. * Add new hash combinations to webapp. * Add "gif file not named *.gif" to issues in update_sharing. * Add redirect for "image_sha512" to webapp for backwards compatibility. --- README | 2 +- dedup/image.py | 67 +++++++++++++++++++++++++++++++++++++------------------ importpkg.py | 17 +++++++++----- schema.sql | 2 +- update_sharing.py | 3 ++- webapp.py | 10 ++++++++- 6 files changed, 70 insertions(+), 31 deletions(-) diff --git a/README b/README index a84807a..bf4da52 100644 --- a/README +++ b/README @@ -47,7 +47,7 @@ one copy in the archive. Finding PNG images that do not carry a .png file extension. - SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND lower(filename) NOT LIKE "%.png"; + SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "png_sha512" AND lower(filename) NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. diff --git a/dedup/image.py b/dedup/image.py index 1148890..c1f2de0 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -4,9 +4,10 @@ import struct import PIL.Image class ImageHash(object): - """A hash on the contents of an image. This disregards mode, depth and meta - information. 
Note that due to limitations in PIL and the image format - (interlacing) the full contents are stored and decoded in hexdigest.""" + """A hash on the contents of an image datat type supported by PIL. This + disregards mode, depth and meta information. Note that due to limitations + in PIL and the image format (interlacing) the full contents are stored and + decoded in hexdigest.""" maxsize = 1024 * 1024 * 32 # max memory usage is about 5 * maxpixels in bytes maxpixels = 1024 * 1024 * 32 @@ -19,33 +20,25 @@ class ImageHash(object): self.imagedetected = False self.content = io.BytesIO() + def detect(self): + raise NotImplementedError + def update(self, data): self.content.write(data) if self.content.tell() > self.maxsize: raise ValueError("maximum image size exceeded") - if self.imagedetected: - return - if self.content.tell() < 33: # header + IHDR - return - curvalue = self.content.getvalue() - if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): - width, height = struct.unpack(">II", curvalue[16:24]) - if width * height > self.maxpixels: - raise ValueError("maximum image pixels exceeded") - self.imagedetected = True - return - raise ValueError("not a png image") + if not self.imagedetected: + self.imagedetected = self.detect() def copy(self): - new = ImageHash() - new.hashobj = self.hashobj.copy() + new = self.__class__(self.hashobj.copy()) new.imagedetected = self.imagedetected new.content = io.BytesIO(self.content.getvalue()) return new def hexdigest(self): if not self.imagedetected: - raise ValueError("not a png image") + raise ValueError("not a image") hashobj = self.hashobj.copy() pos = self.content.tell() try: @@ -53,7 +46,7 @@ class ImageHash(object): try: img = PIL.Image.open(self.content) except IOError: - raise ValueError("broken png header") + raise ValueError("broken header") width, height = img.size pack = lambda elem: struct.pack("BBBB", *elem) # special casing easy modes reduces memory usage @@ -64,13 +57,43 @@ class ImageHash(object): elif 
img.mode != "RGBA": try: img = img.convert("RGBA") - except (SyntaxError, IndexError, IOError): # crazy stuff from PIL - raise ValueError("error reading png image") + except (SyntaxError, IndexError, IOError): + # crazy stuff from PIL + raise ValueError("error reading image") try: for elem in img.getdata(): hashobj.update(pack(elem)) except (SyntaxError, IndexError, IOError): # crazy stuff from PIL - raise ValueError("error reading png image") + raise ValueError("error reading image") finally: self.content.seek(pos) return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height) + + +class PNGHash(ImageHash): + """A hash on the contents of a PNG image.""" + + def detect(self): + if self.content.tell() < 33: # header + IHDR + return False + curvalue = self.content.getvalue() + if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): + width, height = struct.unpack(">II", curvalue[16:24]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + return True + raise ValueError("not a png image") + +class GIFHash(ImageHash): + """A hash on the contents of the first frame of a GIF image.""" + + def detect(self): + if self.content.tell() < 10: # magic + logical dimension + return False + curvalue = self.content.getvalue() + if curvalue.startswith((b"GIF87a", "GIF89a")): + width, height = struct.unpack("<HH", curvalue[6:10]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + return True + raise ValueError("not a png image") diff --git a/importpkg.py b/importpkg.py index 02d4936..182ca01 100755 --- a/importpkg.py +++ b/importpkg.py @@ -19,7 +19,7 @@ from dedup.arreader import ArReader from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \ HashedStream, hash_file from dedup.compression import GzipDecompressor, DecompressedStream -from dedup.image import ImageHash +from dedup.image import GIFHash, PNGHash class MultiHash(object): def __init__(self, *hashes): @@ -44,17 +44,24 @@ def gziphash(): hashobj.name =
"gzip_sha512" return HashBlacklist(hashobj, boring_sha512_hashes) -def imagehash(): - hashobj = ImageHash(hashlib.sha512()) +def pnghash(): + hashobj = PNGHash(hashlib.sha512()) hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "image_sha512" + hashobj.name = "png_sha512" + return hashobj + +def gifhash(): + hashobj = GIFHash(hashlib.sha512()) + hashobj = SuppressingHash(hashobj, (ValueError,)) + hashobj.name = "gif_sha512" return hashobj def get_hashes(tar): for elem in tar: if not elem.isreg(): # excludes hard links as well continue - hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash()) + hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(), + gifhash()) hasher = hash_file(hasher, tar.extractfile(elem)) hashes = {} for hashobj in hasher.hashes: diff --git a/schema.sql b/schema.sql index 13a65aa..ddc6ccd 100644 --- a/schema.sql +++ b/schema.sql @@ -1,7 +1,7 @@ CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL); -INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("image_sha512"); +INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512"); CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE INDEX content_package_size_index ON content (pid, size); diff --git a/update_sharing.py b/update_sharing.py index 910662e..5ec6c7b 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -56,7 +56,8 @@ def main(): [(row[1],) for row in rows]) process_pkgdict(cur, pkgdict) 
cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');") - cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'image_sha512' AND lower(filename) NOT LIKE '%.png';") + cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';") + cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';") db.commit() if __name__ == "__main__": diff --git a/webapp.py b/webapp.py index 6c6f5b4..260268a 100755 --- a/webapp.py +++ b/webapp.py @@ -14,7 +14,10 @@ from dedup.utils import fetchiter hash_functions = [ ("sha512", "sha512"), - ("image_sha512", "image_sha512"), + ("png_sha512", "png_sha512"), + ("png_sha512", "gif_sha512"), + ("gif_sha512", "png_sha512"), + ("gif_sha512", "gif_sha512"), ("gzip_sha512", "gzip_sha512"), ("sha512", "gzip_sha512"), ("gzip_sha512", "sha512")] @@ -87,6 +90,11 @@ class Application(object): elif endpoint == "detail": return self.show_detail(args["package1"], args["package2"]) elif endpoint == "hash": + if args["function"] == "image_sha512": + # backwards compatibility + raise RequestRedirect("%s/hash/png_sha512/%s" % + (request.environ["SCRIPT_NAME"], + args["hashvalue"])) return self.show_hash(args["function"], args["hashvalue"]) elif endpoint == "index": if 
not request.environ["PATH_INFO"]: -- cgit v1.2.3