From 27b95909f061ae3ecb3ba1b8d46adfef98ca5e6f Mon Sep 17 00:00:00 2001 From: Helmut Grohne <helmut@subdivi.de> Date: Sun, 16 Feb 2020 08:21:20 +0100 Subject: drop support for Python 2.x --- webapp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index f9e667e..69e9df8 100755 --- a/webapp.py +++ b/webapp.py @@ -1,4 +1,4 @@ -#!/usr/bin/python +#!/usr/bin/python3 import argparse import contextlib @@ -68,7 +68,7 @@ class InternalRedirect(Exception): self.target = target self.code = code -class Application(object): +class Application: def __init__(self, db): self.db = db self.routingmap = Map([ -- cgit v1.2.3 From c7615fcb537f547da3068d3e489437e70db58447 Mon Sep 17 00:00:00 2001 From: Helmut Grohne <helmut@subdivi.de> Date: Wed, 29 Dec 2021 20:34:51 +0100 Subject: webapp: forward compatibility with newer werkzeug --- webapp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 69e9df8..8f5d342 100755 --- a/webapp.py +++ b/webapp.py @@ -11,7 +11,10 @@ from werkzeug.exceptions import HTTPException, NotFound from werkzeug.routing import Map, Rule from werkzeug.utils import redirect from werkzeug.wrappers import Request, Response -from werkzeug.wsgi import SharedDataMiddleware +try: + from werkzeug.middleware.shared_data import SharedDataMiddleware +except ImportError: + from werkzeug.wsgi import SharedDataMiddleware from dedup.utils import fetchiter -- cgit v1.2.3 From e118de84d60e6f0d7662dcbb6aa362f452dda6ba Mon Sep 17 00:00:00 2001 From: Helmut Grohne <helmut@subdivi.de> Date: Wed, 29 Dec 2021 20:56:03 +0100 Subject: webapp: improve performance html_response expects a str-generator, but when we call the render method, we receive a plain str. It can be iterated - one character at a time. That's what encode_and_buffer will do in this case. So better stream all the time. --- webapp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 8f5d342..9993cb0 100755 --- a/webapp.py +++ b/webapp.py @@ -100,7 +100,7 @@ class Application: elif endpoint == "index": if not request.environ["PATH_INFO"]: raise InternalRedirect("/") - return html_response(index_template.render(dict(urlroot=""))) + return html_response(index_template.stream(dict(urlroot=""))) elif endpoint == "source": return self.show_source(args["package"]) raise NotFound() @@ -159,7 +159,7 @@ class Application: (params["pid"],)) params["issues"] = dict(cur.fetchall()) cur.close() - return html_response(package_template.render(params)) + return html_response(package_template.stream(params)) def compute_comparison(self, pid1, pid2): """Compute a sequence of comparison objects ordered by the size of the @@ -237,7 +237,7 @@ class Application: raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, urlroot="../..") - return html_response(hash_template.render(params)) + return html_response(hash_template.stream(params)) def show_source(self, package): with contextlib.closing(self.db.cursor()) as cur: @@ -256,7 +256,7 @@ class Application: if not (oldentry and oldentry["savable"] >= size): binpkgs[binary] = entry params = dict(source=package, packages=binpkgs, urlroot="..") - return html_response(source_template.render(params)) + return html_response(source_template.stream(params)) def main(): parser = argparse.ArgumentParser() -- cgit v1.2.3 From 69a8861b704c969260ecb55110d8e41cd9aaf0a7 Mon Sep 17 00:00:00 2001 From: Helmut Grohne <helmut@subdivi.de> Date: Wed, 29 Dec 2021 21:00:04 +0100 Subject: webapp: speed up encode_and_buffer We now know that our parameter is a jinja2.environment.TemplateStream. Enable buffering and accumulate via an io.BytesIO to avoid O(n^2) append. --- webapp.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 9993cb0..d91d724 100755 --- a/webapp.py +++ b/webapp.py @@ -3,6 +3,7 @@ import argparse import contextlib import datetime +import io import sqlite3 from wsgiref.simple_server import make_server @@ -49,15 +50,16 @@ hash_template = jinjaenv.get_template("hash.html") index_template = jinjaenv.get_template("index.html") source_template = jinjaenv.get_template("source.html") -def encode_and_buffer(iterator): - buff = b"" - for elem in iterator: - buff += elem.encode("utf8") - if len(buff) >= 2048: - yield buff - buff = b"" - if buff: - yield buff +def encode_and_buffer(stream): + stream.enable_buffering(16) + buff = io.BytesIO() + for elem in stream: + buff.write(elem.encode("utf8")) + if buff.tell() >= 2048: + yield buff.getvalue() + buff = io.BytesIO() + if buff.tell() > 0: + yield buff.getvalue() def html_response(unicode_iterator, max_age=24 * 60 * 60): resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html") -- cgit v1.2.3 From 9b2cd74186f74a3c3e7c10b0ce39ebd992b11d36 Mon Sep 17 00:00:00 2001 From: Helmut Grohne <helmut@subdivi.de> Date: Wed, 29 Dec 2021 21:14:38 +0100 Subject: webapp: avoid changing variable type Again static type checking is the driver for the change here. --- webapp.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index d91d724..0d9e3f9 100755 --- a/webapp.py +++ b/webapp.py @@ -22,18 +22,18 @@ from dedup.utils import fetchiter jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates")) def format_size(size): - size = float(size) + sizef = float(size) fmt = "%d B" - if size >= 1024: - size /= 1024 + if sizef >= 1024: + sizef /= 1024 fmt = "%.1f KB" - if size >= 1024: - size /= 1024 + if sizef >= 1024: + sizef /= 1024 fmt = "%.1f MB" - if size >= 1024: - size /= 1024 + if sizef >= 1024: + sizef /= 1024 fmt = "%.1f GB" - return fmt % size + return fmt % sizef def function_combination(function1, function2): if function1 == function2: -- cgit v1.2.3 From 1631e91b116ebf04ba9bd332e12c2f165263088b Mon Sep 17 00:00:00 2001 From: Helmut Grohne <helmut@subdivi.de> Date: Fri, 31 Dec 2021 13:00:29 +0100 Subject: webapp.py: consistently close cursors using context managers --- webapp.py | 72 +++++++++++++++++++++++++++++++-------------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 0d9e3f9..162a5a4 100755 --- a/webapp.py +++ b/webapp.py @@ -84,6 +84,9 @@ class Application: Rule("/source/<package>", methods=("GET",), endpoint="source"), ]) + def cursor(self): + return contextlib.closing(self.db.cursor()) + @Request.application def __call__(self, request): mapadapter = self.routingmap.bind_to_environ(request.environ) @@ -112,7 +115,7 @@ class Application: return e def get_details(self, package): - with contextlib.closing(self.db.cursor()) as cur: + with self.cursor() as cur: cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;", (package,)) row = cur.fetchone() @@ -132,14 +135,14 @@ class Application: return details def get_dependencies(self, pid): - with contextlib.closing(self.db.cursor()) as cur: + with self.cursor() as cur: cur.execute("SELECT required FROM dependency WHERE pid = ?;", (pid,)) return set(row[0] for row in fetchiter(cur)) def cached_sharedstats(self, pid): sharedstats = {} - with contextlib.closing(self.db.cursor()) as cur: + with self.cursor() as cur: cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", (pid,)) for pid2, package2, func1, func2, files, size in fetchiter(cur): @@ -156,11 +159,10 @@ class Application: params["dependencies"] = self.get_dependencies(params["pid"]) params["shared"] = self.cached_sharedstats(params["pid"]) params["urlroot"] = ".." - cur = self.db.cursor() - cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;", - (params["pid"],)) - params["issues"] = dict(cur.fetchall()) - cur.close() + with self.cursor() as cur: + cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;", + (params["pid"],)) + params["issues"] = dict(cur.fetchall()) return html_response(package_template.stream(params)) def compute_comparison(self, pid1, pid2): @@ -173,35 +175,33 @@ class Application: * matches: A mapping from filenames in package 2 (pid2) to a mapping from hash function pairs to hash values. """ - cur = self.db.cursor() - cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;", - (pid1,)) - cursize = -1 - files = dict() - minmatch = 2 if pid1 == pid2 else 1 - cur2 = self.db.cursor() - for cid, filename, size, hashvalue in fetchiter(cur): - if cursize != size: - for entry in files.values(): - if len(entry["matches"]) >= minmatch: - yield entry - files.clear() - cursize = size + with self.cursor() as cur, self.cursor() as cur2: + cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;", + (pid1,)) + cursize = -1 + files = dict() + minmatch = 2 if pid1 == pid2 else 1 + cur2 = self.db.cursor() + for cid, filename, size, hashvalue in fetchiter(cur): + if cursize != size: + for entry in files.values(): + if len(entry["matches"]) >= minmatch: + yield entry + files.clear() + cursize = size - if hashvalue in files: - files[hashvalue]["filenames"].add(filename) - continue + if hashvalue in files: + files[hashvalue]["filenames"].add(filename) + continue - entry = dict(filenames=set((filename,)), size=size, matches={}) - files[hashvalue] = entry + entry = dict(filenames=set((filename,)), size=size, matches={}) + files[hashvalue] = entry - cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;", - (cid, pid2)) - for func1, hashvalue, func2, filename in fetchiter(cur2): - entry["matches"].setdefault(filename, {})[func1, func2] = \ - hashvalue - cur2.close() - cur.close() + cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;", + (cid, pid2)) + for func1, hashvalue, func2, filename in fetchiter(cur2): + entry["matches"].setdefault(filename, {})[func1, func2] = \ + hashvalue for entry in files.values(): if len(entry["matches"]) >= minmatch: @@ -221,7 +221,7 @@ class Application: return html_response(detail_template.stream(params)) def show_hash(self, function, hashvalue): - with contextlib.closing(self.db.cursor()) as cur: + with self.cursor() as cur: cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", (function, hashvalue,)) entries = [dict(package=package, filename=filename, size=size, @@ -242,7 +242,7 @@ class Application: return html_response(hash_template.stream(params)) def show_source(self, package): - with contextlib.closing(self.db.cursor()) as cur: + with self.cursor() as cur: cur.execute("SELECT name FROM package WHERE source = ?;", (package,)) binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) -- cgit v1.2.3