summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- dedup/arreader.py 79
-rw-r--r-- dedup/image.py 12
-rwxr-xr-x importpkg.py 54
-rwxr-xr-x webapp.py 3
4 files changed, 90 insertions, 58 deletions
diff --git a/dedup/arreader.py b/dedup/arreader.py
new file mode 100644
index 0000000..d74ae37
--- /dev/null
+++ b/dedup/arreader.py
@@ -0,0 +1,79 @@
+import struct
+
+class ArReader(object):
+ """Streaming AR file reader. After constructing an object, you usually
+ call read_magic once. Then you call read_entry in a loop and use the
+ ArReader object as file-like only providing read() to read the respective
+ file contents until you get EOFError from read_entry.
+ """
+ global_magic = b"!<arch>\n"
+ file_magic = b"`\n"
+
+ def __init__(self, fileobj):
+ """
+ @param fileobj: a file-like object providing nothing but read(length)
+ """
+ self.fileobj = fileobj
+ self.remaining = None
+ self.padding = 0
+
+ def read_magic(self):
+ """Consume the AR magic marker at the beginning of an AR file. You
+ must not call any other method before calling this method.
+ @raises ValueError: if the magic is not found
+ """
+ data = self.fileobj.read(len(self.global_magic))
+ if data != self.global_magic:
+ raise ValueError("ar global header not found")
+ self.remaining = 0
+
+ def read_entry(self):
+ """Read the next file header, return the filename and record the
+ length of the next file, so that the read method can be used to
+ exhaustively read the current file.
+ @rtype: bytes
+ @returns: the name of the next file
+ @raises ValueError: if the data format is wrong
+ @raises EOFError: when the end of the stream is reached
+ """
+ self.skip_current_entry()
+ if self.padding:
+ if self.fileobj.read(1) != b'\n':
+ raise ValueError("missing ar padding")
+ self.padding = 0
+ file_header = self.fileobj.read(60)
+ if not file_header:
+ raise EOFError("end of archive found")
+ parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
+ parts = [p.rstrip(b" ") for p in parts]
+ if parts.pop() != self.file_magic:
+ raise ValueError("ar file header not found")
+ self.remaining = int(parts[5])
+ self.padding = self.remaining % 2
+ return parts[0] # name
+
+ def skip_current_entry(self):
+ """Skip the remainder of the current file. This method must not be
+ called before calling read_entry.
+ @raises ValueError: if the archive appears truncated
+ """
+ while self.remaining:
+ data = self.fileobj.read(min(4096, self.remaining))
+ if not data:
+ raise ValueError("archive truncated")
+ self.remaining -= len(data)
+
+ def read(self, length=None):
+ """
+ @type length: int or None
+ @param length: number of bytes to read from the current file
+ @rtype: bytes
+ @returns: length or fewer bytes from the current file
+ """
+ if length is None:
+ length = self.remaining
+ else:
+ length = min(self.remaining, length)
+ data = self.fileobj.read(length)
+ self.remaining -= len(data)
+ return data
diff --git a/dedup/image.py b/dedup/image.py
index e05e7da..acbf0ca 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -50,7 +50,10 @@ class ImageHash(object):
pos = self.content.tell()
try:
self.content.seek(0)
- img = PIL.Image.open(self.content)
+ try:
+ img = PIL.Image.open(self.content)
+ except IOError:
+ raise ValueError("broken png header")
width, height = img.size
pack = lambda elem: struct.pack("BBBB", *elem)
# special casing easy modes reduces memory usage
@@ -60,8 +63,11 @@ class ImageHash(object):
pack = lambda elem: struct.pack("BBBB", *(elem + (255,)))
elif img.mode != "RGBA":
img = img.convert("RGBA")
- for elem in img.getdata():
- hashobj.update(pack(elem))
+ try:
+ for elem in img.getdata():
+ hashobj.update(pack(elem))
+ except (SyntaxError, IndexError, IOError): # crazy stuff from PIL
+ raise ValueError("error reading png image")
finally:
self.content.seek(pos)
return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height)
diff --git a/importpkg.py b/importpkg.py
index 5d6a58c..e0160e6 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -2,7 +2,6 @@
import hashlib
import sqlite3
-import struct
import sys
import tarfile
import zlib
@@ -11,62 +10,11 @@ from debian.debian_support import version_compare
from debian import deb822
import lzma
+from dedup.arreader import ArReader
from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file
from dedup.compression import GzipDecompressor, DecompressedStream
from dedup.image import ImageHash
-class ArReader(object):
- global_magic = b"!<arch>\n"
- file_magic = b"`\n"
-
- def __init__(self, fileobj):
- self.fileobj = fileobj
- self.remaining = None
- self.padding = 0
-
- def skip(self, length):
- while length:
- data = self.fileobj.read(min(4096, length))
- if not data:
- raise ValueError("archive truncated")
- length -= len(data)
-
- def read_magic(self):
- data = self.fileobj.read(len(self.global_magic))
- if data != self.global_magic:
- raise ValueError("ar global header not found")
- self.remaining = 0
-
- def read_entry(self):
- self.skip_current_entry()
- if self.padding:
- if self.fileobj.read(1) != '\n':
- raise ValueError("missing ar padding")
- self.padding = 0
- file_header = self.fileobj.read(60)
- if not file_header:
- raise EOFError("end of archive found")
- parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
- parts = [p.rstrip(" ") for p in parts]
- if parts.pop() != self.file_magic:
- raise ValueError("ar file header not found")
- self.remaining = int(parts[5])
- self.padding = self.remaining % 2
- return parts[0] # name
-
- def skip_current_entry(self):
- self.skip(self.remaining)
- self.remaining = 0
-
- def read(self, length=None):
- if length is None:
- length = self.remaining
- else:
- length = min(self.remaining, length)
- data = self.fileobj.read(length)
- self.remaining -= len(data)
- return data
-
class MultiHash(object):
def __init__(self, *hashes):
self.hashes = hashes
diff --git a/webapp.py b/webapp.py
index 1da987b..dc439a1 100755
--- a/webapp.py
+++ b/webapp.py
@@ -21,7 +21,6 @@ hash_functions = [
jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader("."))
def format_size(size):
- assert isinstance(size, int)
size = float(size)
fmt = "%d B"
if size >= 1024:
@@ -241,7 +240,7 @@ class Application(object):
details = dict(package=package,
version=version,
architecture=architecture)
- cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;",
+ cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ? AND function = 'sha512';",
(package,))
num_files, total_size = cur.fetchone()
details.update(dict(num_files=num_files, total_size=total_size))