4 files changed, 90 insertions, 58 deletions
diff --git a/dedup/arreader.py b/dedup/arreader.py
new file mode 100644
index 0000000..d74ae37
--- /dev/null
+++ b/dedup/arreader.py
@@ -0,0 +1,79 @@
+import struct
+
+class ArReader(object):
+    """Streaming AR file reader. After constructing an object, you usually
+    call read_magic once. Then you call read_entry in a loop and use the
+    ArReader object as a file-like object (providing only read()) to read the
+    contents of each file, until read_entry raises EOFError.
+    """
+    global_magic = b"!<arch>\n"
+    file_magic = b"`\n"
+
+    def __init__(self, fileobj):
+        """
+        @param fileobj: a file-like object providing nothing but read(length)
+        """
+        self.fileobj = fileobj
+        self.remaining = None
+        self.padding = 0
+
+    def read_magic(self):
+        """Consume the AR magic marker at the beginning of an AR file. You
+        must not call any other method before calling this method.
+        @raises ValueError: if the magic is not found
+        """
+        data = self.fileobj.read(len(self.global_magic))
+        if data != self.global_magic:
+            raise ValueError("ar global header not found")
+        self.remaining = 0
+
+    def read_entry(self):
+        """Read the next file header, return the filename and record the
+        length of the next file, so that the read method can be used to
+        exhaustively read the current file.
+        @rtype: bytes
+        @returns: the name of the next file
+        @raises ValueError: if the data format is wrong
+        @raises EOFError: when the end of the stream is reached
+        """
+        self.skip_current_entry()
+        if self.padding:
+            if self.fileobj.read(1) != b'\n':
+                raise ValueError("missing ar padding")
+            self.padding = 0
+        file_header = self.fileobj.read(60)
+        if not file_header:
+            raise EOFError("end of archive found")
+        parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
+        parts = [p.rstrip(b" ") for p in parts]
+        if parts.pop() != self.file_magic:
+            raise ValueError("ar file header not found")
+        self.remaining = int(parts[5])
+        self.padding = self.remaining % 2
+        return parts[0] # name
+
+    def skip_current_entry(self):
+        """Skip the remainder of the current file. This method must not be
+        called before calling read_entry.
+        @raises ValueError: if the archive appears truncated
+        """
+        while self.remaining:
+            data = self.fileobj.read(min(4096, self.remaining))
+            if not data:
+                raise ValueError("archive truncated")
+            self.remaining -= len(data)
+
+    def read(self, length=None):
+        """
+        @type length: int or None
+        @param length: number of bytes to read from the current file
+        @rtype: bytes
+        @returns: length or fewer bytes from the current file
+        """
+        if length is None:
+            length = self.remaining
+        else:
+            length = min(self.remaining, length)
+        data = self.fileobj.read(length)
+        self.remaining -= len(data)
+        return data
diff --git a/dedup/image.py b/dedup/image.py
index e05e7da..acbf0ca 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -50,7 +50,10 @@ class ImageHash(object):
         pos = self.content.tell()
         try:
             self.content.seek(0)
-            img = PIL.Image.open(self.content)
+            try:
+                img = PIL.Image.open(self.content)
+            except IOError:
+                raise ValueError("broken png header")
             width, height = img.size
             pack = lambda elem: struct.pack("BBBB", *elem)
             # special casing easy modes reduces memory usage
@@ -60,8 +63,11 @@ class ImageHash(object):
                 pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
             elif img.mode != "RGBA":
                 img = img.convert("RGBA")
-            for elem in img.getdata():
-                hashobj.update(pack(elem))
+            try:
+                for elem in img.getdata():
+                    hashobj.update(pack(elem))
+            except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
+                raise ValueError("error reading png image")
         finally:
             self.content.seek(pos)
         return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
diff --git a/importpkg.py b/importpkg.py
index 5d6a58c..e0160e6 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -2,7 +2,6 @@
 
 import hashlib
 import sqlite3
-import struct
 import sys
 import tarfile
 import zlib
@@ -11,62 +10,11 @@ from debian.debian_support import version_compare
 from debian import deb822
 import lzma
 
+from dedup.arreader import ArReader
 from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import ImageHash
 
-class ArReader(object):
-    global_magic = b"!<arch>\n"
-    file_magic = b"`\n"
-
-    def __init__(self, fileobj):
-        self.fileobj = fileobj
-        self.remaining = None
-        self.padding = 0
-
-    def skip(self, length):
-        while length:
-            data = self.fileobj.read(min(4096, length))
-            if not data:
-                raise ValueError("archive truncated")
-            length -= len(data)
-
-    def read_magic(self):
-        data = self.fileobj.read(len(self.global_magic))
-        if data != self.global_magic:
-            raise ValueError("ar global header not found")
-        self.remaining = 0
-
-    def read_entry(self):
-        self.skip_current_entry()
-        if self.padding:
-            if self.fileobj.read(1) != '\n':
-                raise ValueError("missing ar padding")
-            self.padding = 0
-        file_header = self.fileobj.read(60)
-        if not file_header:
-            raise EOFError("end of archive found")
-        parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
-        parts = [p.rstrip(" ") for p in parts]
-        if parts.pop() != self.file_magic:
-            raise ValueError("ar file header not found")
-        self.remaining = int(parts[5])
-        self.padding = self.remaining % 2
-        return parts[0] # name
-
-    def skip_current_entry(self):
-        self.skip(self.remaining)
-        self.remaining = 0
-
-    def read(self, length=None):
-        if length is None:
-            length = self.remaining
-        else:
-            length = min(self.remaining, length)
-        data = self.fileobj.read(length)
-        self.remaining -= len(data)
-        return data
-
 class MultiHash(object):
     def __init__(self, *hashes):
         self.hashes = hashes
diff --git a/webapp.py b/webapp.py
index 1da987b..dc439a1 100755
--- a/webapp.py
+++ b/webapp.py
@@ -21,7 +21,6 @@ hash_functions = [
 jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
 
 def format_size(size):
-    assert isinstance(size, int)
     size = float(size)
     fmt = "%d B"
     if size >= 1024:
@@ -241,7 +240,7 @@ class Application(object):
         details = dict(package=package,
                        version=version,
                        architecture=architecture)
-        cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;",
+        cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ? AND function = 'sha512';",
                     (package,))
         num_files, total_size = cur.fetchone()
         details.update(dict(num_files=num_files, total_size=total_size))