diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-08-01 23:06:26 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-08-01 23:06:26 +0200 |
commit | 2712edb550968ce7ec8cd9800241d7944666631a (patch) | |
tree | c213907b81aaad6cc7ab07fc7ea809f32fcb5fbe | |
parent | d3f68ad766b1c33867c2c504b0f5e6d9bb7cbf03 (diff) | |
download | debian-dedup-2712edb550968ce7ec8cd9800241d7944666631a.tar.gz |
support hashing gif images
* Rename "image_sha512" to "png_sha512".
* dedup.image.ImageHash is now a base class for image hashes such as
PNGHash and GIFHash.
* Enable both hashes in importpkg.
* Fix README.
* Add new hash combinations to webapp.
* Add "gif file not named *.gif" to issues in update_sharing.
* Add redirect for "image_sha512" to webapp for backwards
compatibility.
-rw-r--r-- | README | 2 | ||||
-rw-r--r-- | dedup/image.py | 67 | ||||
-rwxr-xr-x | importpkg.py | 17 | ||||
-rw-r--r-- | schema.sql | 2 | ||||
-rwxr-xr-x | update_sharing.py | 3 | ||||
-rwxr-xr-x | webapp.py | 10 |
6 files changed, 70 insertions, 31 deletions
@@ -47,7 +47,7 @@ one copy in the archive. Finding PNG images that do not carry a .png file extension. - SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND lower(filename) NOT LIKE "%.png"; + SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "png_sha512" AND lower(filename) NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. diff --git a/dedup/image.py b/dedup/image.py index 1148890..c1f2de0 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -4,9 +4,10 @@ import struct import PIL.Image class ImageHash(object): - """A hash on the contents of an image. This disregards mode, depth and meta - information. Note that due to limitations in PIL and the image format - (interlacing) the full contents are stored and decoded in hexdigest.""" + """A hash on the contents of an image datat type supported by PIL. This + disregards mode, depth and meta information. Note that due to limitations + in PIL and the image format (interlacing) the full contents are stored and + decoded in hexdigest.""" maxsize = 1024 * 1024 * 32 # max memory usage is about 5 * maxpixels in bytes maxpixels = 1024 * 1024 * 32 @@ -19,33 +20,25 @@ class ImageHash(object): self.imagedetected = False self.content = io.BytesIO() + def detect(self): + raise NotImplementedError + def update(self, data): self.content.write(data) if self.content.tell() > self.maxsize: raise ValueError("maximum image size exceeded") - if self.imagedetected: - return - if self.content.tell() < 33: # header + IHDR - return - curvalue = self.content.getvalue() - if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): - width, height = struct.unpack(">II", curvalue[16:24]) - if width * height > self.maxpixels: - raise ValueError("maximum image pixels exceeded") - self.imagedetected = True - return - raise ValueError("not a png image") + if not self.imagedetected: + self.imagedetected = self.detect() def copy(self): - new = ImageHash() - new.hashobj = self.hashobj.copy() + new = self.__class__(self.hashobj.copy()) new.imagedetected = self.imagedetected new.content = io.BytesIO(self.content.getvalue()) return new def hexdigest(self): if not self.imagedetected: - raise ValueError("not a png image") + raise ValueError("not a image") hashobj = self.hashobj.copy() pos = self.content.tell() try: @@ -53,7 +46,7 @@ class ImageHash(object): try: img = PIL.Image.open(self.content) except IOError: - raise ValueError("broken png header") + raise ValueError("broken header") width, height = img.size pack = lambda elem: struct.pack("BBBB", *elem) # special casing easy modes reduces memory usage @@ -64,13 +57,43 @@ class ImageHash(object): elif img.mode != "RGBA": try: img = img.convert("RGBA") - except (SyntaxError, IndexError, IOError): # crazy stuff from PIL - raise ValueError("error reading png image") + except (SyntaxError, IndexError, IOError): + # crazy stuff from PIL + raise ValueError("error reading image") try: for elem in img.getdata(): hashobj.update(pack(elem)) except (SyntaxError, IndexError, IOError): # crazy stuff from PIL - raise ValueError("error reading png image") + raise ValueError("error reading image") finally: self.content.seek(pos) return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height) + + +class PNGHash(ImageHash): + """A hash on the contents of a PNG image.""" + + def detect(self): + if self.content.tell() < 33: # header + IHDR + return False + curvalue = self.content.getvalue() + if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): + width, height = struct.unpack(">II", curvalue[16:24]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + return True + raise ValueError("not a png image") + +class GIFHash(ImageHash): + """A hash on the contents of the first frame of a GIF image.""" + + def detect(self): + if self.content.tell() < 10: # magic + logical dimension + return False + curvalue = self.content.getvalue() + if curvalue.startswith((b"GIF87a", "GIF89a")): + width, height = struct.unpack("<HH", curvalue[6:10]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + return True + raise ValueError("not a png image") diff --git a/importpkg.py b/importpkg.py index 02d4936..182ca01 100755 --- a/importpkg.py +++ b/importpkg.py @@ -19,7 +19,7 @@ from dedup.arreader import ArReader from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \ HashedStream, hash_file from dedup.compression import GzipDecompressor, DecompressedStream -from dedup.image import ImageHash +from dedup.image import GIFHash, PNGHash class MultiHash(object): def __init__(self, *hashes): @@ -44,17 +44,24 @@ def gziphash(): hashobj.name = "gzip_sha512" return HashBlacklist(hashobj, boring_sha512_hashes) -def imagehash(): - hashobj = ImageHash(hashlib.sha512()) +def pnghash(): + hashobj = PNGHash(hashlib.sha512()) hashobj = SuppressingHash(hashobj, (ValueError,)) - hashobj.name = "image_sha512" + hashobj.name = "png_sha512" + return hashobj + +def gifhash(): + hashobj = GIFHash(hashlib.sha512()) + hashobj = SuppressingHash(hashobj, (ValueError,)) + hashobj.name = "gif_sha512" return hashobj def get_hashes(tar): for elem in tar: if not elem.isreg(): # excludes hard links as well continue - hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash()) + hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(), + gifhash()) hasher = hash_file(hasher, tar.extractfile(elem)) hashes = {} for hashobj in hasher.hashes: @@ -1,7 +1,7 @@ CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL); -INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("image_sha512"); +INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512"); CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE INDEX content_package_size_index ON content (pid, size); diff --git a/update_sharing.py b/update_sharing.py index 910662e..5ec6c7b 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -56,7 +56,8 @@ def main(): [(row[1],) for row in rows]) process_pkgdict(cur, pkgdict) cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');") - cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'image_sha512' AND lower(filename) NOT LIKE '%.png';") + cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';") + cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';") db.commit() if __name__ == "__main__": @@ -14,7 +14,10 @@ from dedup.utils import fetchiter hash_functions = [ ("sha512", "sha512"), - ("image_sha512", "image_sha512"), + ("png_sha512", "png_sha512"), + ("png_sha512", "gif_sha512"), + ("gif_sha512", "png_sha512"), + ("gif_sha512", "gif_sha512"), ("gzip_sha512", "gzip_sha512"), ("sha512", "gzip_sha512"), ("gzip_sha512", "sha512")] @@ -87,6 +90,11 @@ class Application(object): elif endpoint == "detail": return self.show_detail(args["package1"], args["package2"]) elif endpoint == "hash": + if args["function"] == "image_sha512": + # backwards compatibility + raise RequestRedirect("%s/hash/png_sha512/%s" % + (request.environ["SCRIPT_NAME"], + args["hashvalue"])) return self.show_hash(args["function"], args["hashvalue"]) elif endpoint == "index": if not request.environ["PATH_INFO"]: |