diff options
-rw-r--r-- | dedup/arreader.py | 79 | ||||
-rw-r--r-- | dedup/image.py | 12 | ||||
-rwxr-xr-x | importpkg.py | 54 | ||||
-rwxr-xr-x | webapp.py | 3 |
4 files changed, 90 insertions, 58 deletions
diff --git a/dedup/arreader.py b/dedup/arreader.py new file mode 100644 index 0000000..d74ae37 --- /dev/null +++ b/dedup/arreader.py @@ -0,0 +1,79 @@ +import struct + +class ArReader(object): + """Streaming AR file reader. After constructing an object, you usually + call read_magic once. Then you call read_entry in a loop and use the + ArReader object as file-like only providing read() to read the respective + file contents until you get EOFError from read_entry. + """ + global_magic = b"!<arch>\n" + file_magic = b"`\n" + + def __init__(self, fileobj): + """ + @param fileobj: a file-like object providing nothing but read(length) + """ + self.fileobj = fileobj + self.remaining = None + self.padding = 0 + + def read_magic(self): + """Consume the AR magic marker at the beginning of an AR file. You + must not call any other method before calling this method. + @raises ValueError: if the magic is not found + """ + data = self.fileobj.read(len(self.global_magic)) + if data != self.global_magic: + raise ValueError("ar global header not found") + self.remaining = 0 + + def read_entry(self): + """Read the next file header, return the filename and record the + length of the next file, so that the read method can be used to + exhaustively read the current file. 
+ @rtype: bytes + @returns: the name of the next file + @raises ValueError: if the data format is wrong + @raises EOFError: when the end of the stream is reached + """ + self.skip_current_entry() + if self.padding: + if self.fileobj.read(1) != b'\n': + raise ValueError("missing ar padding") + self.padding = 0 + file_header = self.fileobj.read(60) + if not file_header: + raise EOFError("end of archive found") + parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header) + parts = [p.rstrip(b" ") for p in parts] + if parts.pop() != self.file_magic: + raise ValueError("ar file header not found") + self.remaining = int(parts[5]) + self.padding = self.remaining % 2 + return parts[0] # name + + def skip_current_entry(self): + """Skip the remainder of the current file. This method must not be + called before calling read_entry. + @raises ValueError: if the archive appears truncated + """ + while self.remaining: + data = self.fileobj.read(min(4096, self.remaining)) + if not data: + raise ValueError("archive truncated") + self.remaining -= len(data) + + def read(self, length=None): + """ + @type length: int or None + @param length: number of bytes to read from the current file + @rtype: bytes + @returns: length or fewer bytes from the current file + """ + if length is None: + length = self.remaining + else: + length = min(self.remaining, length) + data = self.fileobj.read(length) + self.remaining -= len(data) + return data diff --git a/dedup/image.py b/dedup/image.py index e05e7da..acbf0ca 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -50,7 +50,10 @@ class ImageHash(object): pos = self.content.tell() try: self.content.seek(0) - img = PIL.Image.open(self.content) + try: + img = PIL.Image.open(self.content) + except IOError: + raise ValueError("broken png header") width, height = img.size pack = lambda elem: struct.pack("BBBB", *elem) # special casing easy modes reduces memory usage @@ -60,8 +63,11 @@ class ImageHash(object): pack = lambda elem: struct.pack("BBBB", 
*(elem + (255,))) elif img.mode != "RGBA": img = img.convert("RGBA") - for elem in img.getdata(): - hashobj.update(pack(elem)) + try: + for elem in img.getdata(): + hashobj.update(pack(elem)) + except (SyntaxError, IndexError, IOError): # crazy stuff from PIL + raise ValueError("error reading png image") finally: self.content.seek(pos) return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height) diff --git a/importpkg.py b/importpkg.py index 5d6a58c..e0160e6 100755 --- a/importpkg.py +++ b/importpkg.py @@ -2,7 +2,6 @@ import hashlib import sqlite3 -import struct import sys import tarfile import zlib @@ -11,62 +10,11 @@ from debian.debian_support import version_compare from debian import deb822 import lzma +from dedup.arreader import ArReader from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file from dedup.compression import GzipDecompressor, DecompressedStream from dedup.image import ImageHash -class ArReader(object): - global_magic = b"!<arch>\n" - file_magic = b"`\n" - - def __init__(self, fileobj): - self.fileobj = fileobj - self.remaining = None - self.padding = 0 - - def skip(self, length): - while length: - data = self.fileobj.read(min(4096, length)) - if not data: - raise ValueError("archive truncated") - length -= len(data) - - def read_magic(self): - data = self.fileobj.read(len(self.global_magic)) - if data != self.global_magic: - raise ValueError("ar global header not found") - self.remaining = 0 - - def read_entry(self): - self.skip_current_entry() - if self.padding: - if self.fileobj.read(1) != '\n': - raise ValueError("missing ar padding") - self.padding = 0 - file_header = self.fileobj.read(60) - if not file_header: - raise EOFError("end of archive found") - parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header) - parts = [p.rstrip(" ") for p in parts] - if parts.pop() != self.file_magic: - raise ValueError("ar file header not found") - self.remaining = int(parts[5]) - self.padding = self.remaining % 2 - return 
parts[0] # name - - def skip_current_entry(self): - self.skip(self.remaining) - self.remaining = 0 - - def read(self, length=None): - if length is None: - length = self.remaining - else: - length = min(self.remaining, length) - data = self.fileobj.read(length) - self.remaining -= len(data) - return data - class MultiHash(object): def __init__(self, *hashes): self.hashes = hashes @@ -21,7 +21,6 @@ hash_functions = [ jinjaenv = jinja2.Environment(loader=jinja2.FileSystemLoader(".")) def format_size(size): - assert isinstance(size, int) size = float(size) fmt = "%d B" if size >= 1024: @@ -241,7 +240,7 @@ class Application(object): details = dict(package=package, version=version, architecture=architecture) - cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;", + cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ? AND function = 'sha512';", (package,)) num_files, total_size = cur.fetchone() details.update(dict(num_files=num_files, total_size=total_size)) |