diff options
-rw-r--r-- | README | 4 | ||||
-rwxr-xr-x | autoimport.py | 12 | ||||
-rw-r--r-- | dedup/image.py | 67 | ||||
-rwxr-xr-x | importpkg.py | 16 | ||||
-rwxr-xr-x | webapp.py | 45 |
5 files changed, 126 insertions, 18 deletions
@@ -1,7 +1,7 @@ Required packages ----------------- -aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 +aptitude install python python-debian python-lzma curl python-jinja2 python-werkzeug sqlite3 python-imaging Create a database ----------------- @@ -12,7 +12,7 @@ Import packages --------------- Import individual packages by feeding them to importpkg.py: - ls -t /var/cache/apt/archives/*.deb | while read f; echo $f; ./importpkg.py < $f || break; done + ls -t /var/cache/apt/archives/*.deb | while read f; do echo $f; ./importpkg.py < $f || break; done Import a full mirror:: diff --git a/autoimport.py b/autoimport.py index 6ce7146..a7ea5ff 100755 --- a/autoimport.py +++ b/autoimport.py @@ -8,6 +8,7 @@ import sys import urllib from debian import deb822 +from debian.debian_support import version_compare def main(): urlbase = sys.argv[1] @@ -22,7 +23,8 @@ def main(): for pkg in deb822.Packages.iter_paragraphs(io.BytesIO(pkglist)): name = pkg["Package"] distpkgs.add(name) - if pkg["Version"] == knownpkgs.get(name, ()): + if name in knownpkgs and \ + version_compare(pkg["Version"], knownpkgs[name]) <= 0: continue pkgurl = "%s/%s" % (urlbase, pkg["Filename"]) print("importing %s" % name) @@ -33,9 +35,15 @@ def main(): if dl.wait(): print("curl failed") + delpkgs = set(knownpkgs) - distpkgs + print("clearing packages %s" % " ".join(delpkgs)) cur.execute("PRAGMA foreign_keys=1;") + cur.executemany("DELETE FROM content WHERE package = ?;", + ((pkg,) for pkg in delpkgs)) + cur.executemany("DELETE FROM dependency WHERE package = ?;", + ((pkg,) for pkg in delpkgs)) cur.executemany("DELETE FROM package WHERE package = ?;", - ((pkg,) for pkg in set(knownpkgs) - distpkgs)) + ((pkg,) for pkg in delpkgs)) db.commit() if __name__ == "__main__": diff --git a/dedup/image.py b/dedup/image.py new file mode 100644 index 0000000..e05e7da --- /dev/null +++ b/dedup/image.py @@ -0,0 +1,67 @@ +import io +import struct + +import PIL.Image + +class ImageHash(object): + """A hash on the contents of an image. This disregards mode, depth and meta + information. Note that due to limitations in PIL and the image format + (interlacing) the full contents are stored and decoded in hexdigest.""" + maxsize = 1024 * 1024 * 32 + # max memory usage is about 5 * maxpixels in bytes + maxpixels = 1024 * 1024 * 32 + + def __init__(self, hashobj): + """ + @param hashobj: a hashlib-like object + """ + self.hashobj = hashobj + self.imagedetected = False + self.content = io.BytesIO() + + def update(self, data): + self.content.write(data) + if self.content.tell() > self.maxsize: + raise ValueError("maximum image size exceeded") + if self.imagedetected: + return + if self.content.tell() < 33: # header + IHDR + return + curvalue = self.content.getvalue() + if curvalue.startswith(b"\x89PNG\r\n\x1a\n\0\0\0\x0dIHDR"): + width, height = struct.unpack(">II", curvalue[16:24]) + if width * height > self.maxpixels: + raise ValueError("maximum image pixels exceeded") + self.imagedetected = True + return + raise ValueError("not a png image") + + def copy(self): + new = ImageHash() + new.hashobj = self.hashobj.copy() + new.imagedetected = self.imagedetected + new.content = io.BytesIO(self.content.getvalue()) + return new + + def hexdigest(self): + if not self.imagedetected: + raise ValueError("not a png image") + hashobj = self.hashobj.copy() + pos = self.content.tell() + try: + self.content.seek(0) + img = PIL.Image.open(self.content) + width, height = img.size + pack = lambda elem: struct.pack("BBBB", *elem) + # special casing easy modes reduces memory usage + if img.mode == "L": + pack = lambda elem: struct.pack("BBBB", elem, elem, elem, 255) + elif img.mode == "RGB": + pack = lambda elem: struct.pack("BBBB", *(elem + (255,))) + elif img.mode != "RGBA": + img = img.convert("RGBA") + for elem in img.getdata(): + hashobj.update(pack(elem)) + finally: + self.content.seek(pos) + return "%s%8.8x%8.8x" % (hashobj.hexdigest(), width, height) diff --git a/importpkg.py b/importpkg.py index 89020b9..d626fba 100755 --- a/importpkg.py +++ b/importpkg.py @@ -20,6 +20,7 @@ import lzma from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, hash_file from dedup.compression import GzipDecompressor, DecompressedStream +from dedup.image import ImageHash class ArReader(object): global_magic = b"!<arch>\n" @@ -96,11 +97,17 @@ def gziphash(): hashobj.name = "gzip_sha512" return HashBlacklist(hashobj, boring_sha512_hashes) +def imagehash(): + hashobj = ImageHash(hashlib.sha512()) + hashobj = SuppressingHash(hashobj, (ValueError,)) + hashobj.name = "image_sha512" + return hashobj + def get_hashes(tar): for elem in tar: if not elem.isreg(): # excludes hard links as well continue - hasher = MultiHash(sha512_nontrivial(), gziphash()) + hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash()) hasher = hash_file(hasher, tar.extractfile(elem)) for hashobj in hasher.hashes: hashvalue = hashobj.hexdigest() @@ -167,8 +174,13 @@ def process_package(db, filelike): if state != "control_file": raise ValueError("missing control file") for name, size, function, hexhash in get_hashes(tf): + try: + name = name.decode("utf8") + except UnicodeDecodeError: + print("warning: skipping filename with encoding error") + continue # skip files with non-utf8 encoding for now cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);", - (package, name.decode("utf8"), size, function, hexhash)) + (package, name, size, function, hexhash)) db.commit() return raise ValueError("data.tar not found") @@ -1,5 +1,6 @@ #!/usr/bin/python +import datetime import sqlite3 from wsgiref.simple_server import make_server @@ -10,6 +11,7 @@ from werkzeug.wrappers import Request, Response hash_functions = [ ("sha512", "sha512"), + ("image_sha512", "image_sha512"), ("gzip_sha512", "gzip_sha512"), ("sha512", "gzip_sha512"), ("gzip_sha512", "sha512")] @@ -104,6 +106,29 @@ index_template = jinjaenv.from_string( </ul> {% endblock %}""") +def fetchiter(cursor): + rows = cursor.fetchmany() + while rows: + for row in rows: + yield row + rows = cursor.fetchmany() + +def encode_and_buffer(iterator): + buff = b"" + for elem in iterator: + buff += elem.encode("utf8") + if len(buff) >= 2048: + yield buff + buff = b"" + if buff: + yield buff + +def html_response(unicode_iterator, max_age=24 * 60 * 60): + resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html") + resp.cache_control.max_age = max_age + resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age) + return resp + class Application(object): def __init__(self): self.db = sqlite3.connect("test.sqlite3") @@ -129,8 +154,7 @@ class Application(object): elif endpoint == "index": if not request.environ["PATH_INFO"]: raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/") - return Response(index_template.render().encode("utf8"), - content_type="text/html") + return html_response(index_template.stream()) raise NotFound() except HTTPException as e: return e @@ -154,7 +178,7 @@ class Application(object): def get_dependencies(self, package): self.cur.execute("SELECT required FROM dependency WHERE package = ?;", (package,)) - return set(row[0] for row in self.cur.fetchall()) + return set(row[0] for row in fetchiter(self.cur)) def show_package(self, package): params = self.get_details(package) @@ -165,7 +189,7 @@ class Application(object): self.cur.execute("SELECT a.filename, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND a.function = ? AND b.function = ? AND (a.filename != b.filename OR b.package != ?);", (package, func1, func2, package)) sharing = dict() - for afile, hashval, size, bpkg in self.cur.fetchall(): + for afile, hashval, size, bpkg in fetchiter(self.cur): hashdict = sharing.setdefault(bpkg, dict()) fileset = hashdict.setdefault(hashval, (size, set()))[1] fileset.add(afile) @@ -186,8 +210,7 @@ class Application(object): curstats.append(dict(package=pkg, duplicate=duplicate, savable=savable)) params["shared"] = sharedstats - return Response(package_template.render(**params).encode("utf8"), - content_type="text/html") + return html_response(package_template.render(params)) def show_detail(self, package1, package2): if package1 == package2: @@ -203,7 +226,7 @@ class Application(object): (package1, package2)) shared = dict() - for filename1, size1, func1, filename2, size2, func2, hashvalue in self.cur.fetchall(): + for filename1, size1, func1, filename2, size2, func2, hashvalue in fetchiter(self.cur): funccomb = (func1, func2) if funccomb not in hash_functions: continue @@ -218,21 +241,19 @@ class Application(object): details1=details1, details2=details2, shared=shared) - return Response(detail_template.render(**params).encode("utf8"), - content_type="text/html") + return html_response(detail_template.render(params)) def show_hash(self, function, hashvalue): self.cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;", (hashvalue,)) entries = [dict(package=package, filename=filename, size=size, function=otherfunc) - for package, filename, size, otherfunc in self.cur.fetchall() + for package, filename, size, otherfunc in fetchiter(self.cur) if (function, otherfunc) in hash_functions] if not entries: raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries) - return Response(hash_template.render(**params).encode("utf8"), - content_type="text/html") + return html_response(hash_template.render(params)) def main(): app = Application() |