From 27b95909f061ae3ecb3ba1b8d46adfef98ca5e6f Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Sun, 16 Feb 2020 08:21:20 +0100
Subject: drop support for Python 2.x

---
 webapp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'webapp.py')

diff --git a/webapp.py b/webapp.py
index f9e667e..69e9df8 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/python3
 
 import argparse
 import contextlib
@@ -68,7 +68,7 @@ class InternalRedirect(Exception):
         self.target = target
         self.code = code
 
-class Application(object):
+class Application:
     def __init__(self, db):
         self.db = db
         self.routingmap = Map([
-- 
cgit v1.2.3


From c7615fcb537f547da3068d3e489437e70db58447 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 29 Dec 2021 20:34:51 +0100
Subject: webapp: forward compatibility with newer werkzeug

---
 webapp.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'webapp.py')

diff --git a/webapp.py b/webapp.py
index 69e9df8..8f5d342 100755
--- a/webapp.py
+++ b/webapp.py
@@ -11,7 +11,10 @@ from werkzeug.exceptions import HTTPException, NotFound
 from werkzeug.routing import Map, Rule
 from werkzeug.utils import redirect
 from werkzeug.wrappers import Request, Response
-from werkzeug.wsgi import SharedDataMiddleware
+try:
+    from werkzeug.middleware.shared_data import SharedDataMiddleware
+except ImportError:
+    from werkzeug.wsgi import SharedDataMiddleware
 
 from dedup.utils import fetchiter
 
-- 
cgit v1.2.3


From e118de84d60e6f0d7662dcbb6aa362f452dda6ba Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 29 Dec 2021 20:56:03 +0100
Subject: webapp: improve performance

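html_response expects a generator of str, but the render method returns
a plain str. A str can still be iterated, but only one character at a
time, and that is exactly what encode_and_buffer ends up doing in this
case. Use the stream method everywhere instead, so that
encode_and_buffer receives template chunks rather than single
characters.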
---
 webapp.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'webapp.py')

diff --git a/webapp.py b/webapp.py
index 8f5d342..9993cb0 100755
--- a/webapp.py
+++ b/webapp.py
@@ -100,7 +100,7 @@ class Application:
             elif endpoint == "index":
                 if not request.environ["PATH_INFO"]:
                     raise InternalRedirect("/")
-                return html_response(index_template.render(dict(urlroot="")))
+                return html_response(index_template.stream(dict(urlroot="")))
             elif endpoint == "source":
                 return self.show_source(args["package"])
             raise NotFound()
@@ -159,7 +159,7 @@ class Application:
                     (params["pid"],))
         params["issues"] = dict(cur.fetchall())
         cur.close()
-        return html_response(package_template.render(params))
+        return html_response(package_template.stream(params))
 
     def compute_comparison(self, pid1, pid2):
         """Compute a sequence of comparison objects ordered by the size of the
@@ -237,7 +237,7 @@ class Application:
                 raise NotFound()
         params = dict(function=function, hashvalue=hashvalue, entries=entries,
                       urlroot="../..")
-        return html_response(hash_template.render(params))
+        return html_response(hash_template.stream(params))
 
     def show_source(self, package):
         with contextlib.closing(self.db.cursor()) as cur:
@@ -256,7 +256,7 @@ class Application:
                 if not (oldentry and oldentry["savable"] >= size):
                     binpkgs[binary] = entry
         params = dict(source=package, packages=binpkgs, urlroot="..")
-        return html_response(source_template.render(params))
+        return html_response(source_template.stream(params))
 
 def main():
     parser = argparse.ArgumentParser()
-- 
cgit v1.2.3


From 69a8861b704c969260ecb55110d8e41cd9aaf0a7 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 29 Dec 2021 21:00:04 +0100
Subject: webapp: speed up encode_and_buffer

We now know that our parameter is always a
jinja2.environment.TemplateStream. Enable its buffering and accumulate
the encoded output in an io.BytesIO to avoid the O(n^2) cost of repeated
bytes concatenation.
---
 webapp.py | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'webapp.py')

diff --git a/webapp.py b/webapp.py
index 9993cb0..d91d724 100755
--- a/webapp.py
+++ b/webapp.py
@@ -3,6 +3,7 @@
 import argparse
 import contextlib
 import datetime
+import io
 import sqlite3
 from wsgiref.simple_server import make_server
 
@@ -49,15 +50,16 @@ hash_template = jinjaenv.get_template("hash.html")
 index_template = jinjaenv.get_template("index.html")
 source_template = jinjaenv.get_template("source.html")
 
-def encode_and_buffer(iterator):
-    buff = b""
-    for elem in iterator:
-        buff += elem.encode("utf8")
-        if len(buff) >= 2048:
-            yield buff
-            buff = b""
-    if buff:
-        yield buff
+def encode_and_buffer(stream):
+    stream.enable_buffering(16)
+    buff = io.BytesIO()
+    for elem in stream:
+        buff.write(elem.encode("utf8"))
+        if buff.tell() >= 2048:
+            yield buff.getvalue()
+            buff = io.BytesIO()
+    if buff.tell() > 0:
+        yield buff.getvalue()
 
 def html_response(unicode_iterator, max_age=24 * 60 * 60):
     resp = Response(encode_and_buffer(unicode_iterator), mimetype="text/html")
-- 
cgit v1.2.3


From 9b2cd74186f74a3c3e7c10b0ce39ebd992b11d36 Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Wed, 29 Dec 2021 21:14:38 +0100
Subject: webapp: avoid changing variable type

Again, static type checking is the driver for this change.
---
 webapp.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'webapp.py')

diff --git a/webapp.py b/webapp.py
index d91d724..0d9e3f9 100755
--- a/webapp.py
+++ b/webapp.py
@@ -22,18 +22,18 @@ from dedup.utils import fetchiter
 jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates"))
 
 def format_size(size):
-    size = float(size)
+    sizef = float(size)
     fmt = "%d B"
-    if size >= 1024:
-        size /= 1024
+    if sizef >= 1024:
+        sizef /= 1024
         fmt = "%.1f KB"
-    if size >= 1024:
-        size /= 1024
+    if sizef >= 1024:
+        sizef /= 1024
         fmt = "%.1f MB"
-    if size >= 1024:
-        size /= 1024
+    if sizef >= 1024:
+        sizef /= 1024
         fmt = "%.1f GB"
-    return fmt % size
+    return fmt % sizef
 
 def function_combination(function1, function2):
     if function1 == function2:
-- 
cgit v1.2.3


From 1631e91b116ebf04ba9bd332e12c2f165263088b Mon Sep 17 00:00:00 2001
From: Helmut Grohne <helmut@subdivi.de>
Date: Fri, 31 Dec 2021 13:00:29 +0100
Subject: webapp.py: consistently close cursors using context managers

---
 webapp.py | 72 +++++++++++++++++++++++++++++++--------------------------------
 1 file changed, 36 insertions(+), 36 deletions(-)

(limited to 'webapp.py')

diff --git a/webapp.py b/webapp.py
index 0d9e3f9..162a5a4 100755
--- a/webapp.py
+++ b/webapp.py
@@ -84,6 +84,9 @@ class Application:
             Rule("/source/<package>", methods=("GET",), endpoint="source"),
         ])
 
+    def cursor(self):
+        return contextlib.closing(self.db.cursor())
+
     @Request.application
     def __call__(self, request):
         mapadapter = self.routingmap.bind_to_environ(request.environ)
@@ -112,7 +115,7 @@ class Application:
             return e
 
     def get_details(self, package):
-        with contextlib.closing(self.db.cursor()) as cur:
+        with self.cursor() as cur:
             cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
                         (package,))
             row = cur.fetchone()
@@ -132,14 +135,14 @@ class Application:
         return details
 
     def get_dependencies(self, pid):
-        with contextlib.closing(self.db.cursor()) as cur:
+        with self.cursor() as cur:
             cur.execute("SELECT required FROM dependency WHERE pid = ?;",
                         (pid,))
             return set(row[0] for row in fetchiter(cur))
 
     def cached_sharedstats(self, pid):
         sharedstats = {}
-        with contextlib.closing(self.db.cursor()) as cur:
+        with self.cursor() as cur:
             cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
                         (pid,))
             for pid2, package2, func1, func2, files, size in fetchiter(cur):
@@ -156,11 +159,10 @@ class Application:
         params["dependencies"] = self.get_dependencies(params["pid"])
         params["shared"] = self.cached_sharedstats(params["pid"])
         params["urlroot"] = ".."
-        cur = self.db.cursor()
-        cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
-                    (params["pid"],))
-        params["issues"] = dict(cur.fetchall())
-        cur.close()
+        with self.cursor() as cur:
+            cur.execute("SELECT content.filename, issue.issue FROM content JOIN issue ON content.id = issue.cid WHERE content.pid = ?;",
+                        (params["pid"],))
+            params["issues"] = dict(cur.fetchall())
         return html_response(package_template.stream(params))
 
     def compute_comparison(self, pid1, pid2):
@@ -173,35 +175,33 @@ class Application:
          * matches: A mapping from filenames in package 2 (pid2) to a mapping
            from hash function pairs to hash values.
         """
-        cur = self.db.cursor()
-        cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;",
-                    (pid1,))
-        cursize = -1
-        files = dict()
-        minmatch = 2 if pid1 == pid2 else 1
-        cur2 = self.db.cursor()
-        for cid, filename, size, hashvalue in fetchiter(cur):
-            if cursize != size:
-                for entry in files.values():
-                    if len(entry["matches"]) >= minmatch:
-                        yield entry
-                files.clear()
-                cursize = size
+        with self.cursor() as cur, self.cursor() as cur2:
+            cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;",
+                        (pid1,))
+            cursize = -1
+            files = dict()
+            minmatch = 2 if pid1 == pid2 else 1
+            cur2 = self.db.cursor()
+            for cid, filename, size, hashvalue in fetchiter(cur):
+                if cursize != size:
+                    for entry in files.values():
+                        if len(entry["matches"]) >= minmatch:
+                            yield entry
+                    files.clear()
+                    cursize = size
 
-            if hashvalue in files:
-                files[hashvalue]["filenames"].add(filename)
-                continue
+                if hashvalue in files:
+                    files[hashvalue]["filenames"].add(filename)
+                    continue
 
-            entry = dict(filenames=set((filename,)), size=size, matches={})
-            files[hashvalue] = entry
+                entry = dict(filenames=set((filename,)), size=size, matches={})
+                files[hashvalue] = entry
 
-            cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;",
-                         (cid, pid2))
-            for func1, hashvalue, func2, filename in fetchiter(cur2):
-                entry["matches"].setdefault(filename, {})[func1, func2] = \
-                        hashvalue
-        cur2.close()
-        cur.close()
+                cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;",
+                             (cid, pid2))
+                for func1, hashvalue, func2, filename in fetchiter(cur2):
+                    entry["matches"].setdefault(filename, {})[func1, func2] = \
+                            hashvalue
 
         for entry in files.values():
             if len(entry["matches"]) >= minmatch:
@@ -221,7 +221,7 @@ class Application:
         return html_response(detail_template.stream(params))
 
     def show_hash(self, function, hashvalue):
-        with contextlib.closing(self.db.cursor()) as cur:
+        with self.cursor() as cur:
             cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
                         (function, hashvalue,))
             entries = [dict(package=package, filename=filename, size=size,
@@ -242,7 +242,7 @@ class Application:
         return html_response(hash_template.stream(params))
 
     def show_source(self, package):
-        with contextlib.closing(self.db.cursor()) as cur:
+        with self.cursor() as cur:
             cur.execute("SELECT name FROM package WHERE source = ?;",
                         (package,))
             binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
-- 
cgit v1.2.3