summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Helmut Grohne <helmut@subdivi.de> 2013-08-01 23:06:26 +0200
committer Helmut Grohne <helmut@subdivi.de> 2013-08-01 23:06:26 +0200
commit 2712edb550968ce7ec8cd9800241d7944666631a (patch)
tree c213907b81aaad6cc7ab07fc7ea809f32fcb5fbe
parent d3f68ad766b1c33867c2c504b0f5e6d9bb7cbf03 (diff)
download debian-dedup-2712edb550968ce7ec8cd9800241d7944666631a.tar.gz
support hashing gif images
* Rename "image_sha512" to "png_sha512".
* dedup.image.ImageHash is now a base class for image hashes such as PNGHash and GIFHash.
* Enable both hashes in importpkg.
* Fix README.
* Add new hash combinations to webapp.
* Add "gif file not named *.gif" to issues in update_sharing.
* Add redirect for "image_sha512" to webapp for backwards compatibility.
-rw-r--r-- README 2
-rw-r--r-- dedup/image.py 67
-rwxr-xr-x importpkg.py 17
-rw-r--r-- schema.sql 2
-rwxr-xr-x update_sharing.py 3
-rwxr-xr-x webapp.py 10
6 files changed, 70 insertions, 31 deletions
diff --git a/README b/README
index a84807a..bf4da52 100644
--- a/README
+++ b/README
@@ -47,7 +47,7 @@ one copy in the archive.
Finding PNG images that do not carry a .png file extension.
- SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND lower(filename) NOT LIKE "%.png";
+ SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "png_sha512" AND lower(filename) NOT LIKE "%.png";
Finding .gz files which either are not gziped or contain errors.
diff --git a/dedup/image.py b/dedup/image.py
index 1148890..c1f2de0 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,9 +4,10 @@ import struct
import PIL.Image
class ImageHash(object):
- """A hash on the contents of an image. This disregards mode, depth and meta
- information. Note that due to limitations in PIL and the image format
- (interlacing) the full contents are stored and decoded in hexdigest."""
+ """A hash on the contents of an image data type supported by PIL. This
+ disregards mode, depth and meta information. Note that due to limitations
+ in PIL and the image format (interlacing) the full contents are stored and
+ decoded in hexdigest."""
maxsize = 1024 * 1024 * 32
# max memory usage is about 5 * maxpixels in bytes
maxpixels = 1024 * 1024 * 32
@@ -19,33 +20,25 @@ class ImageHash(object):
self.imagedetected = False
self.content = io.BytesIO()
+ def detect(self):
+ raise NotImplementedError
+
def update(self, data):
self.content.write(data)
if self.content.tell() > self.maxsize:
raise ValueError("maximum image size exceeded")
- if self.imagedetected:
- return
- if self.content.tell() < 33: # header + IHDR
- return
- curvalue = self.content.getvalue()
- if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
- width, height = struct.unpack(">II", curvalue[16:24])
- if width * height > self.maxpixels:
- raise ValueError("maximum image pixels exceeded")
- self.imagedetected = True
- return
- raise ValueError("not a png image")
+ if not self.imagedetected:
+ self.imagedetected = self.detect()
def copy(self):
- new = ImageHash()
- new.hashobj = self.hashobj.copy()
+ new = self.__class__(self.hashobj.copy())
new.imagedetected = self.imagedetected
new.content = io.BytesIO(self.content.getvalue())
return new
def hexdigest(self):
if not self.imagedetected:
- raise ValueError("not a png image")
+ raise ValueError("not a image")
hashobj = self.hashobj.copy()
pos = self.content.tell()
try:
@@ -53,7 +46,7 @@ class ImageHash(object):
try:
img = PIL.Image.open(self.content)
except IOError:
- raise ValueError("broken png header")
+ raise ValueError("broken header")
width, height = img.size
pack = lambda elem: struct.pack("BBBB", *elem)
# special casing easy modes reduces memory usage
@@ -64,13 +57,43 @@ class ImageHash(object):
elif img.mode != "RGBA":
try:
img = img.convert("RGBA")
- except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ except (SyntaxError, IndexError, IOError):
+ # crazy stuff from PIL
+ raise ValueError("error reading image")
try:
for elem in img.getdata():
hashobj.update(pack(elem))
except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
- raise ValueError("error reading png image")
+ raise ValueError("error reading image")
finally:
self.content.seek(pos)
return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
+
+
+class PNGHash(ImageHash):
+ """A hash on the contents of a PNG image."""
+
+ def detect(self): # True once the PNG header is recognised, False while too little data is buffered
+ if self.content.tell() < 33: # header + IHDR (8-byte signature + 25-byte IHDR chunk)
+ return False
+ curvalue = self.content.getvalue()
+ if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): # signature, IHDR length (13), chunk type
+ width, height = struct.unpack(">II", curvalue[16:24]) # two big-endian uint32 right after the chunk type
+ if width * height > self.maxpixels: # checked from the header alone, while data is still being buffered
+ raise ValueError("maximum image pixels exceeded")
+ return True
+ raise ValueError("not a png image")
+
+class GIFHash(ImageHash):
+    """A hash on the contents of the first frame of a GIF image."""
+
+    def detect(self):  # True once the GIF header is recognised, False while too little data is buffered
+        if self.content.tell() < 10:  # magic + logical dimension (6 + 2 * 2 bytes)
+            return False
+        curvalue = self.content.getvalue()
+        # Every prefix must be bytes: content is a BytesIO, and mixing in a
+        # text string makes bytes.startswith raise TypeError on Python 3.
+        if curvalue.startswith((b"GIF87a", b"GIF89a")):
+            width, height = struct.unpack("<HH", curvalue[6:10])  # little-endian uint16 pair after the magic
+            if width * height > self.maxpixels:
+                raise ValueError("maximum image pixels exceeded")
+            return True
+        raise ValueError("not a gif image")
diff --git a/importpkg.py b/importpkg.py
index 02d4936..182ca01 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -19,7 +19,7 @@ from dedup.arreader import ArReader
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
HashedStream, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
-from dedup.image import ImageHash
+from dedup.image import GIFHash, PNGHash
class MultiHash(object):
def __init__(self, *hashes):
@@ -44,17 +44,24 @@ def gziphash():
hashobj.name = "gzip_sha512"
return HashBlacklist(hashobj, boring_sha512_hashes)
-def imagehash():
- hashobj = ImageHash(hashlib.sha512())
+def pnghash():
+ hashobj = PNGHash(hashlib.sha512())
hashobj = SuppressingHash(hashobj, (ValueError,))
- hashobj.name = "image_sha512"
+ hashobj.name = "png_sha512"
+ return hashobj
+
+def gifhash():
+ hashobj = GIFHash(hashlib.sha512())
+ hashobj = SuppressingHash(hashobj, (ValueError,))
+ hashobj.name = "gif_sha512"
return hashobj
def get_hashes(tar):
for elem in tar:
if not elem.isreg(): # excludes hard links as well
continue
- hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
+ hasher = MultiHash(sha512_nontrivial(), gziphash(), pnghash(),
+ gifhash())
hasher = hash_file(hasher, tar.extractfile(elem))
hashes = {}
for hashobj in hasher.hashes:
diff --git a/schema.sql b/schema.sql
index 13a65aa..ddc6ccd 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,7 +1,7 @@
CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL);
-INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("image_sha512");
+INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512");
CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id));
CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
CREATE INDEX content_package_size_index ON content (pid, size);
diff --git a/update_sharing.py b/update_sharing.py
index 910662e..5ec6c7b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -56,7 +56,8 @@ def main():
[(row[1],) for row in rows])
process_pkgdict(cur, pkgdict)
cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
- cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'image_sha512' AND lower(filename) NOT LIKE '%.png';")
+ cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
+ cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
db.commit()
if __name__ == "__main__":
diff --git a/webapp.py b/webapp.py
index 6c6f5b4..260268a 100755
--- a/webapp.py
+++ b/webapp.py
@@ -14,7 +14,10 @@ from dedup.utils import fetchiter
hash_functions = [
("sha512", "sha512"),
- ("image_sha512", "image_sha512"),
+ ("png_sha512", "png_sha512"),
+ ("png_sha512", "gif_sha512"),
+ ("gif_sha512", "png_sha512"),
+ ("gif_sha512", "gif_sha512"),
("gzip_sha512", "gzip_sha512"),
("sha512", "gzip_sha512"),
("gzip_sha512", "sha512")]
@@ -87,6 +90,11 @@ class Application(object):
elif endpoint == "detail":
return self.show_detail(args["package1"], args["package2"])
elif endpoint == "hash":
+ if args["function"] == "image_sha512":
+ # backwards compatibility
+ raise RequestRedirect("%s/hash/png_sha512/%s" %
+ (request.environ["SCRIPT_NAME"],
+ args["hashvalue"]))
return self.show_hash(args["function"], args["hashvalue"])
elif endpoint == "index":
if not request.environ["PATH_INFO"]: