5 files changed, 126 insertions, 18 deletions
diff --git a/README b/README
index 3f4af93..aff9868 100644
--- a/README
+++ b/README
@@ -1,7 +1,7 @@
 Required packages
 -----------------
 
-aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3
+aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging
 
 Create a database
 -----------------
@@ -12,7 +12,7 @@ Import packages
 ---------------
 Import individual packages by feeding them to importpkg.py:
 
-    ls -t /var/cache/apt/archives/*.deb | while read f; echo $f; ./importpkg.py < $f || break; done
+    ls -t /var/cache/apt/archives/*.deb | while read f; do echo $f; ./importpkg.py < $f || break; done
 
 Import a full mirror::
 
diff --git a/autoimport.py b/autoimport.py
index 6ce7146..a7ea5ff 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -8,6 +8,7 @@ import sys
 import urllib
 
 from debian import deb822
+from debian.debian_support import version_compare
 
 def main():
     urlbase = sys.argv[1]
@@ -22,7 +23,8 @@ def main():
     for pkg in deb822.Packages.iter_paragraphs(io.BytesIO(pkglist)):
         name = pkg["Package"]
         distpkgs.add(name)
-        if pkg["Version"] == knownpkgs.get(name, ()):
+        if name in knownpkgs and \
+                version_compare(pkg["Version"], knownpkgs[name]) <= 0:
             continue
         pkgurl = "%s/%s" % (urlbase, pkg["Filename"])
         print("importing %s" % name)
@@ -33,9 +35,15 @@ def main():
         if dl.wait():
             print("curl failed")
     
+    delpkgs = set(knownpkgs) - distpkgs
+    print("clearing packages %s" % " ".join(delpkgs))
     cur.execute("PRAGMA foreign_keys=1;")
+    cur.executemany("DELETE FROM content WHERE package = ?;",
+                    ((pkg,) for pkg in delpkgs))
+    cur.executemany("DELETE FROM dependency WHERE package = ?;",
+                    ((pkg,) for pkg in delpkgs))
     cur.executemany("DELETE FROM package WHERE package = ?;",
-                    ((pkg,) for pkg in set(knownpkgs) - distpkgs))
+                    ((pkg,) for pkg in delpkgs))
     db.commit()
 
 if __name__ == "__main__":
diff --git a/dedup/image.py b/dedup/image.py
new file mode 100644
index 0000000..e05e7da
--- /dev/null
+++ b/dedup/image.py
@@ -0,0 +1,67 @@
+import io
+import struct
+
+import PIL.Image
+
+class ImageHash(object):
+    """A hash on the contents of an image. This disregards mode, depth and meta
+    information. Note that due to limitations in PIL and the image format
+    (interlacing) the full contents are stored and decoded in hexdigest."""
+    maxsize = 1024 * 1024 * 32
+    # max memory usage is about 5 * maxpixels in bytes
+    maxpixels = 1024 * 1024 * 32
+
+    def __init__(self, hashobj):
+        """
+        @param hashobj: a hashlib-like object
+        """
+        self.hashobj = hashobj
+        self.imagedetected = False
+        self.content = io.BytesIO()
+
+    def update(self, data):
+        self.content.write(data)
+        if self.content.tell() > self.maxsize:
+            raise ValueError("maximum image size exceeded")
+        if self.imagedetected:
+            return
+        if self.content.tell() < 33: # header + IHDR
+            return
+        curvalue = self.content.getvalue()
+        if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"):
+            width, height = struct.unpack(">II", curvalue[16:24])
+            if width * height > self.maxpixels:
+                raise ValueError("maximum image pixels exceeded")
+            self.imagedetected = True
+            return
+        raise ValueError("not a png image")
+
+    def copy(self):
+        new = ImageHash()
+        new.hashobj = self.hashobj.copy()
+        new.imagedetected = self.imagedetected
+        new.content = io.BytesIO(self.content.getvalue())
+        return new
+
+    def hexdigest(self):
+        if not self.imagedetected:
+            raise ValueError("not a png image")
+        hashobj = self.hashobj.copy()
+        pos = self.content.tell()
+        try:
+            self.content.seek(0)
+            img = PIL.Image.open(self.content)
+            width, height = img.size
+            pack = lambda elem: struct.pack("BBBB", *elem)
+            # special casing easy modes reduces memory usage
+            if img.mode == "L":
+                pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255)
+            elif img.mode == "RGB":
+                pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
+            elif img.mode != "RGBA":
+                img = img.convert("RGBA")
+            for elem in img.getdata():
+                hashobj.update(pack(elem))
+        finally:
+            self.content.seek(pos)
+        return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
diff --git a/importpkg.py b/importpkg.py
index 89020b9..d626fba 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -20,6 +20,7 @@ import lzma
 
 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
 from dedup.compression import GzipDecompressor, DecompressedStream
+from dedup.image import ImageHash
 
 class ArReader(object):
     global_magic = b"!<arch>\n"
@@ -96,11 +97,17 @@ def gziphash():
     hashobj.name = "gzip_sha512"
     return HashBlacklist(hashobj, boring_sha512_hashes)
 
+def imagehash():
+    hashobj = ImageHash(hashlib.sha512())
+    hashobj = SuppressingHash(hashobj, (ValueError,))
+    hashobj.name = "image_sha512"
+    return hashobj
+
 def get_hashes(tar):
     for elem in tar:
         if not elem.isreg(): # excludes hard links as well
             continue
-        hasher = MultiHash(sha512_nontrivial(), gziphash())
+        hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
         hasher = hash_file(hasher, tar.extractfile(elem))
         for hashobj in hasher.hashes:
             hashvalue = hashobj.hexdigest()
@@ -167,8 +174,13 @@ def process_package(db, filelike):
         if state != "control_file":
             raise ValueError("missing control file")
         for name, size, function, hexhash in get_hashes(tf):
+            try:
+                name = name.decode("utf8")
+            except UnicodeDecodeError:
+                print("warning: skipping filename with encoding error")
+                continue # skip files with non-utf8 encoding for now
             cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
-                        (package, name.decode("utf8"), size, function, hexhash))
+                        (package, name, size, function, hexhash))
         db.commit()
         return
     raise ValueError("data.tar not found")
diff --git a/webapp.py b/webapp.py
index 06aa5d6..a215dd5 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import datetime
 import sqlite3
 from wsgiref.simple_server import make_server
 
@@ -10,6 +11,7 @@ from werkzeug.wrappers import Request, Response
 
 hash_functions = [
         ("sha512", "sha512"),
+        ("image_sha512", "image_sha512"),
         ("gzip_sha512", "gzip_sha512"),
         ("sha512", "gzip_sha512"),
         ("gzip_sha512", "sha512")]
@@ -104,6 +106,29 @@ index_template = jinjaenv.from_string(
 </ul>
 {% endblock %}""")
 
+def fetchiter(cursor):
+    rows = cursor.fetchmany()
+    while rows:
+        for row in rows:
+            yield row
+        rows = cursor.fetchmany()
+
+def encode_and_buffer(iterator):
+    buff = b""
+    for elem in iterator:
+        buff += elem.encode("utf8")
+        if len(buff) >= 2048:
+            yield buff
+            buff = b""
+    if buff:
+        yield buff
+
+def html_response(unicode_iterator, max_age=24 * 60 * 60):
+    resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
+    resp.cache_control.max_age = max_age
+    resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age)
+    return resp
+
 class Application(object):
     def __init__(self):
         self.db = sqlite3.connect("test.sqlite3")
@@ -129,8 +154,7 @@ class Application(object):
             elif endpoint == "index":
                 if not request.environ["PATH_INFO"]:
                     raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
-                return Response(index_template.render().encode("utf8"),
-                                content_type="text/html")
+                return html_response(index_template.stream())
             raise NotFound()
         except HTTPException as e:
             return e
@@ -154,7 +178,7 @@ class Application(object):
     def get_dependencies(self, package):
         self.cur.execute("SELECT required FROM dependency WHERE package = ?;",
                          (package,))
-        return set(row[0] for row in self.cur.fetchall())
+        return set(row[0] for row in fetchiter(self.cur))
 
     def show_package(self, package):
         params = self.get_details(package)
@@ -165,7 +189,7 @@ class Application(object):
             self.cur.execute("SELECT a.filename, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND a.function = ? AND b.function = ? AND (a.filename != b.filename OR b.package != ?);",
                              (package, func1, func2, package))
             sharing = dict()
-            for afile, hashval, size, bpkg in self.cur.fetchall():
+            for afile, hashval, size, bpkg in fetchiter(self.cur):
                 hashdict = sharing.setdefault(bpkg, dict())
                 fileset = hashdict.setdefault(hashval, (size, set()))[1]
                 fileset.add(afile)
@@ -186,8 +210,7 @@ class Application(object):
                     curstats.append(dict(package=pkg, duplicate=duplicate, savable=savable))
 
         params["shared"] = sharedstats
-        return Response(package_template.render(**params).encode("utf8"),
-                        content_type="text/html")
+        return html_response(package_template.render(params))
 
     def show_detail(self, package1, package2):
         if package1 == package2:
@@ -203,7 +226,7 @@ class Application(object):
                              (package1, package2))
 
         shared = dict()
-        for filename1, size1, func1, filename2, size2, func2, hashvalue in self.cur.fetchall():
+        for filename1, size1, func1, filename2, size2, func2, hashvalue in fetchiter(self.cur):
             funccomb = (func1, func2)
             if funccomb not in hash_functions:
                 continue
@@ -218,21 +241,19 @@ class Application(object):
             details1=details1,
             details2=details2,
             shared=shared)
-        return Response(detail_template.render(**params).encode("utf8"),
-                        content_type="text/html")
+        return html_response(detail_template.render(params))
 
     def show_hash(self, function, hashvalue):
         self.cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
                          (hashvalue,))
         entries = [dict(package=package, filename=filename, size=size,
                         function=otherfunc)
-                   for package, filename, size, otherfunc in self.cur.fetchall()
+                   for package, filename, size, otherfunc in fetchiter(self.cur)
                    if (function, otherfunc) in hash_functions]
         if not entries:
             raise NotFound()
         params = dict(function=function, hashvalue=hashvalue, entries=entries)
-        return Response(hash_template.render(**params).encode("utf8"),
-                        content_type="text/html")
+        return html_response(hash_template.render(params))
 
 def main():
     app = Application()