From ad8baf9798d1e4f339d1bd61b178ae0d0f1c7751 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 20 Feb 2013 15:28:04 +0100 Subject: many improvements * multiple hashes * template engine * new table package * comparison view * hashvalue view --- test.py | 157 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- webapp.py | 170 ++++++++++++++++++++++++++++++++++++++++++++++++-------------- 2 files changed, 280 insertions(+), 47 deletions(-) diff --git a/test.py b/test.py index ca9ed0b..733a52b 100755 --- a/test.py +++ b/test.py @@ -1,6 +1,7 @@ #!/usr/bin/python """ -CREATE TABLE content (package TEXT, version TEXT, architecture TEXT, filename TEXT, size INTEGER, hash TEXT); +CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT); +CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package)); CREATE INDEX content_package_index ON content (package); CREATE INDEX content_hash_index ON content (hash); """ @@ -12,6 +13,7 @@ import sqlite3 import struct import sys import tarfile +import zlib import apt_pkg import lzma @@ -91,6 +93,123 @@ class XzStream(object): else: self.buff += self.decomp.flush() +class MultiHash(object): + def __init__(self, *hashes): + self.hashes = hashes + + def update(self, data): + for hasher in self.hashes: + hasher.update(data) + +class HashBlacklist(object): + def __init__(self, hasher, blacklist=set()): + self.hasher = hasher + self.blacklist = blacklist + self.update = self.hasher.update + self.name = hasher.name + + def hexdigest(self): + digest = self.hasher.hexdigest() + if digest in self.blacklist: + return None + return digest + +class GzipDecompressor(object): + def __init__(self): + self.inbuffer = b"" + self.decompressor = None # zlib.decompressobj(-zlib.MAX_WBITS) + + def decompress(self, data): + if self.decompressor: + data = self.decompressor.decompress(data) + if not self.decompressor.unused_data: + return data + unused_data = self.decompressor.unused_data + self.decompressor = None + return data + self.decompress(unused_data) + self.inbuffer += data + skip = 10 + if len(self.inbuffer) < skip: + return b"" + if not self.inbuffer.startswith(b"\037\213\010"): + raise ValueError("gzip magic not found") + flag = ord(self.inbuffer[3]) + if flag & 4: + if len(self.inbuffer) < skip + 2: + return b"" + length, = struct.unpack("= 1024: @@ -22,13 +26,63 @@ def format_size(size): fmt = "%.1f GB" return fmt % size +jinjaenv.filters["format_size"] = format_size + +package_template = jinjaenv.from_string( +"""duplication of {{ package|e }} +

{{ package|e }}

+

Version: {{ version|e }}

+

Architecture: {{ architecture|e }}

+

Number of files: {{ num_files }}

+

Total size: {{ total_size|format_size }}

+{%- if shared -%} + {%- for function, sharing in shared.items() -%} +

sharing with respect to {{ function }}

+ + {%- for entry in sharing|sort(attribute="savable", reverse=true) -%} + + + + {%- endfor -%} +
packagefiles shareddata shared
{% if entry.package %}{{ entry.package|e }}{% else %}self{% endif %} + compare{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%){{ entry.savable|format_size }} ({{ (100 * entry.savable / total_size)|int }}%)
+ {%- endfor -%} +{%- endif -%} +{{ content }} +""") + +detail_template = jinjaenv.from_string( +"""sharing between {{ details1.package|e }} and {{ details2.package|e }} +

{{ details1.package|e }} <-> {{ details2.package|e }}

+{%- if shared -%} + + {%- for entry in shared|sort(attribute="size", reverse=true) -%} + + {%- endfor -%} +
sizefilename in {{ details1.package|e }}filename in {{ details2.package|e }}hash functions
{{ entry.size|format_size }}{{ entry.filename1 }}{{ entry.filename2 }} + {%- for function, hashvalue in entry.functions.items() %}{{ function|e }} {% endfor %}
+{%- endif -%} +""") + +hash_template = jinjaenv.from_string( +"""information on {{ function|e }} hash {{ hashvalue|e }} +

{{ function|e }} {{ hashvalue|e }}

+ +{%- for entry in entries -%} + + +{%- endfor -%} +
packagefilenamesize
{{ entry.package|e }}{{ entry.filename|e }}{{ entry.size|format_size }}
+""") + class Application(object): def __init__(self): self.db = sqlite3.connect("test.sqlite3") self.cur = self.db.cursor() self.routingmap = Map([ - Rule("/", methods=("GET",), - endpoint="package"), + Rule("/binary/", methods=("GET",), endpoint="package"), + Rule("/compare//", methods=("GET",), endpoint="detail"), + Rule("/hash//", methods=("GET",), endpoint="hash"), ]) @Request.application @@ -36,55 +90,95 @@ class Application(object): mapadapter = self.routingmap.bind_to_environ(request.environ) try: endpoint, args = mapadapter.match() - assert endpoint == "package" - return self.show_package(args["package"]) + if endpoint == "package": + return self.show_package(args["package"]) + elif endpoint == "detail": + return self.show_detail(args["package1"], args["package2"]) + elif endpoint == "hash": + return self.show_hash(args["function"], args["hashvalue"]) + raise NotFound() except HTTPException as e: return e - def show_package(self, package): - self.cur.execute("SELECT version, architecture FROM content WHERE package = ? LIMIT 1;", (package,)) + def get_details(self, package): + self.cur.execute("SELECT version, architecture FROM package WHERE package = ?;", + (package,)) row = self.cur.fetchone() if not row: raise NotFound() version, architecture = row - self.cur.execute("SELECT count(filename) FROM content WHERE package = ?;", (package,)) - num_files = self.cur.fetchone()[0] - self.cur.execute("SELECT sum(size) FROM content WHERE package = ?;", (package,)) - total_size = self.cur.fetchone()[0] - content = "

Version: %s

Architecture: %s

" % (version, architecture) - content += "

Number of files: %d

" % num_files - content += "

Total size: %s

" % format_size(total_size) + details = dict(package=package, + version=version, + architecture=architecture) + self.cur.execute("SELECT count(filename), sum(size) FROM content WHERE package = ?;", + (package,)) + num_files, total_size = self.cur.fetchone() + details.update(dict(num_files=num_files, total_size=total_size)) + return details + + def show_package(self, package): + params = self.get_details(package) shared = dict() - self.cur.execute("SELECT a.filename, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND (a.filename != b.filename OR b.package != ?);", (package, package)) - for afile, hashval, size, bpkg in self.cur.fetchall(): - shared.setdefault(bpkg, dict()).setdefault(hashval, (size, set()))[1].add(afile) + self.cur.execute("SELECT a.filename, a.function, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.function = b.function AND a.hash = b.hash WHERE a.package = ? AND (a.filename != b.filename OR b.package != ?);", + (package, package)) + for afile, function, hashval, size, bpkg in self.cur.fetchall(): + pkgdict = shared.setdefault(function, dict()) + hashdict = pkgdict.setdefault(bpkg, dict()) + fileset = hashdict.setdefault(hashval, (size, set()))[1] + fileset.add(afile) + sharedstats = {} if shared: - sharedstats = [] - mapping = shared.pop(package, dict()) - if mapping: - duplicate = sum(len(files) for _, files in mapping.values()) - savable = sum(size * (len(files) - 1) for size, files in mapping.values()) - sharedstats.append(("self", duplicate, savable)) - for pkg, mapping in shared.items(): - pkglink = '%s' % (pkg, pkg) - duplicate = sum(len(files) for _, files in mapping.values()) - savable = sum(size * len(files) for size, files in mapping.values()) - sharedstats.append((pkglink, duplicate, savable)) - sharedstats.sort(key=lambda row: row[2], reverse=True) - content += "" - for pkg, duplicate, savable in sharedstats: - content += "" % (pkg, duplicate, 100. * duplicate / num_files, format_size(savable), 100. * savable / total_size) - content += "
packagefiles shareddata shared
%s%d (%d%%)%s (%d%%)
" - - r = Response(content_type="text/html") - r.data = "duplication of %(package)s

%(package)s

%(content)s" % dict(package=package, content=content) - return r + for function, sharing in shared.items(): + sharedstats[function] = list() + mapping = sharing.pop(package, dict()) + if mapping: + duplicate = sum(len(files) for _, files in mapping.values()) + savable = sum(size * (len(files) - 1) for size, files in mapping.values()) + sharedstats[function].append(dict(package=None, duplicate=duplicate, savable=savable)) + for pkg, mapping in sharing.items(): + duplicate = sum(len(files) for _, files in mapping.values()) + savable = sum(size * len(files) for size, files in mapping.values()) + sharedstats[function].append(dict(package=pkg, duplicate=duplicate, savable=savable)) + + params["shared"] = sharedstats + return Response(package_template.render(**params).encode("utf8"), + content_type="text/html") + + def show_detail(self, package1, package2): + details1 = self.get_details(package1) + details2 = self.get_details(package2) + + self.cur.execute("SELECT a.filename, b.filename, a.size, a.function, a.hash FROM content AS a JOIN content AS b ON a.function = b.function AND a.hash = b.hash WHERE a.package = ? AND b.package = ? AND a.filename != b.filename;", + (package1, package2)) + shared = dict() + for filename1, filename2, size, function, hashvalue in self.cur.fetchall(): + shared.setdefault((filename1, filename2, size), dict())[function] = hashvalue + shared = [dict(filename1=filename1, filename2=filename2, size=size, + functions=functions) + for (filename1, filename2, size), functions in shared.items()] + params = dict( + details1=details1, + details2=details2, + shared=shared) + return Response(detail_template.render(**params).encode("utf8"), + content_type="text/html") + + def show_hash(self, function, hashvalue): + self.cur.execute("SELECT package, filename, size FROM content WHERE function = ? AND hash = ?;", + (function, hashvalue)) + entries = [dict(package=package, filename=filename, size=size) + for package, filename, size in self.cur.fetchall()] + if not entries: + raise NotFound() + params = dict(function=function, hashvalue=hashvalue, entries=entries) + return Response(hash_template.render(**params).encode("utf8"), + content_type="text/html") def main(): app = Application() - app = DebuggedApplication(app, evalex=True) - make_server("localhost", 8800, app).serve_forever() + #app = DebuggedApplication(app, evalex=True) + make_server("0.0.0.0", 8800, app).serve_forever() if __name__ == "__main__": main() -- cgit v1.2.3