From ca65a78a9ace0aeb2565df0da171727c04c33970 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 10 Sep 2013 09:39:40 +0200 Subject: webapp: close database cursors Leaking them can result in running out of available filedescriptors. --- webapp.py | 98 ++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 50 insertions(+), 48 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index d5f076e..4478ba0 100755 --- a/webapp.py +++ b/webapp.py @@ -1,5 +1,6 @@ #!/usr/bin/python +import contextlib import datetime import optparse import sqlite3 @@ -98,42 +99,43 @@ class Application(object): return e def get_details(self, package): - cur = self.db.cursor() - cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;", - (package,)) - row = cur.fetchone() - if not row: - raise NotFound() - pid, version, architecture = row - details = dict(pid=pid, - package=package, - version=version, - architecture=architecture) - cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;", - (pid,)) - num_files, total_size = cur.fetchone() + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;", + (package,)) + row = cur.fetchone() + if not row: + raise NotFound() + pid, version, architecture = row + details = dict(pid=pid, + package=package, + version=version, + architecture=architecture) + cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;", + (pid,)) + num_files, total_size = cur.fetchone() if total_size is None: total_size = 0 details.update(dict(num_files=num_files, total_size=total_size)) return details def get_dependencies(self, pid): - cur = self.db.cursor() - cur.execute("SELECT required FROM dependency WHERE pid = ?;", - (pid,)) - return set(row[0] for row in fetchiter(cur)) + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT required FROM dependency WHERE pid = ?;", + (pid,)) + return set(row[0] for row in fetchiter(cur)) def cached_sharedstats(self, pid): - cur = self.db.cursor() sharedstats = {} - cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", - (pid,)) - for pid2, package2, func1, func2, files, size in fetchiter(cur): - curstats = sharedstats.setdefault( - function_combination(func1, func2), list()) - if pid2 == pid: - package2 = None - curstats.append(dict(package=package2, duplicate=files, savable=size)) + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", + (pid,)) + for pid2, package2, func1, func2, files, size in fetchiter(cur): + curstats = sharedstats.setdefault( + function_combination(func1, func2), list()) + if pid2 == pid: + package2 = None + curstats.append(dict(package=package2, duplicate=files, + savable=size)) return sharedstats def show_package(self, package): @@ -206,12 +208,12 @@ class Application(object): return html_response(detail_template.stream(params)) def show_hash(self, function, hashvalue): - cur = self.db.cursor() - cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", - (function, hashvalue,)) - entries = [dict(package=package, filename=filename, size=size, - function=otherfunc) - for package, filename, size, otherfunc in fetchiter(cur)] + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", + (function, hashvalue,)) + entries = [dict(package=package, filename=filename, size=size, + function=otherfunc) + for package, filename, size, otherfunc in fetchiter(cur)] if not entries: raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, @@ -219,21 +221,21 @@ class Application(object): return html_response(hash_template.render(params)) def show_source(self, package): - cur = self.db.cursor() - cur.execute("SELECT name FROM package WHERE source = ?;", - (package,)) - binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) - if not binpkgs: - raise NotFound - cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", - (package,)) - for binary, otherbin, func1, func2, files, size in fetchiter(cur): - entry = dict(package=otherbin, - funccomb=function_combination(func1, func2), - duplicate=files, savable=size) - oldentry = binpkgs.get(binary) - if not (oldentry and oldentry["savable"] >= size): - binpkgs[binary] = entry + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT name FROM package WHERE source = ?;", + (package,)) + binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) + if not binpkgs: + raise NotFound + cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", + (package,)) + for binary, otherbin, func1, func2, files, size in fetchiter(cur): + entry = dict(package=otherbin, + funccomb=function_combination(func1, func2), + duplicate=files, savable=size) + oldentry = binpkgs.get(binary) + if not (oldentry and oldentry["savable"] >= size): + binpkgs[binary] = entry params = dict(source=package, packages=binpkgs, urlroot="..") return html_response(source_template.render(params)) -- cgit v1.2.3 From 786c4f93ea318a3c22479f80531594435fb036c3 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 11 Sep 2013 08:35:41 +0200 Subject: webapp: open cursors less often On the main instance opening cursors equals initiating a connection. Unfortunately sqlite3.Connection.close does not close filedescriptors. So just open less cursors to leak filedescriptors less often. --- webapp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 4478ba0..665ac23 100755 --- a/webapp.py +++ b/webapp.py @@ -166,6 +166,7 @@ class Application(object): cursize = -1 files = dict() minmatch = 2 if pid1 == pid2 else 1 + cur2 = self.db.cursor() for cid, filename, size, hashvalue in fetchiter(cur): if cursize != size: for entry in files.values(): @@ -181,13 +182,12 @@ class Application(object): entry = dict(filenames=set((filename,)), size=size, matches={}) files[hashvalue] = entry - cur2 = self.db.cursor() cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;", (cid, pid2)) for func1, hashvalue, func2, filename in fetchiter(cur2): entry["matches"].setdefault(filename, {})[func1, func2] = \ hashvalue - cur2.close() + cur2.close() cur.close() for entry in files.values(): -- cgit v1.2.3 From b38f14ab3fb72ca1578d7e6bb09178e6fbebba76 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 23 Feb 2014 15:44:03 +0100 Subject: webapp: fix eqclass usage in package comparison When comparing two packages, objects would be considered duplicates without considering whether the respective hash functions are comparable by checking their equivalence classes. The current set of hash functions does not expose this bug. --- webapp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 665ac23..fd6d685 100755 --- a/webapp.py +++ b/webapp.py @@ -182,7 +182,7 @@ class Application(object): entry = dict(filenames=set((filename,)), size=size, matches={}) files[hashvalue] = entry - cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;", + cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;", (cid, pid2)) for func1, hashvalue, func2, filename in fetchiter(cur2): entry["matches"].setdefault(filename, {})[func1, func2] = \ -- cgit v1.2.3 From 8d4c5512edbdcdd1063a7e6508f398a5a57981be Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 23 Feb 2014 18:19:35 +0100 Subject: spell check comments --- dedup/hashing.py | 2 +- dedup/image.py | 2 +- importpkg.py | 4 ++-- webapp.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'webapp.py') diff --git a/dedup/hashing.py b/dedup/hashing.py index 70f6268..a8a46c7 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -115,7 +115,7 @@ class DecompressedHash(object): class SuppressingHash(object): """A hash that silences exceptions from the update and hexdigest methods of - a hashlib-like object. If an exception has occured, hexdigest always + a hashlib-like object. If an exception has occurred, hexdigest always returns None.""" def __init__(self, hashobj, exceptions=()): """ diff --git a/dedup/image.py b/dedup/image.py index c1f2de0..ef17989 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -4,7 +4,7 @@ import struct import PIL.Image class ImageHash(object): - """A hash on the contents of an image datat type supported by PIL. This + """A hash on the contents of an image data type supported by PIL. This disregards mode, depth and meta information. Note that due to limitations in PIL and the image format (interlacing) the full contents are stored and decoded in hexdigest.""" diff --git a/importpkg.py b/importpkg.py index cb16f97..aeccda5 100755 --- a/importpkg.py +++ b/importpkg.py @@ -1,7 +1,7 @@ #!/usr/bin/python -"""This tool reads a debian package from stdin and emits a yaml stream on +"""This tool reads a Debian package from stdin and emits a yaml stream on stdout. It does not access a database. Therefore it can be run in parallel and -on multiple machines. The generated yaml conatins multiple documents. The first +on multiple machines. The generated yaml contains multiple documents. The first document contains package metadata. Then a document is emitted for each file. And finally a document consisting of the string "commit" is emitted.""" diff --git a/webapp.py b/webapp.py index fd6d685..2fd69bb 100755 --- a/webapp.py +++ b/webapp.py @@ -151,7 +151,7 @@ class Application(object): return html_response(package_template.render(params)) def compute_comparison(self, pid1, pid2): - """Compute a sequence of comparison objects ordery by the size of the + """Compute a sequence of comparison objects ordered by the size of the object in the first package. Each element of the sequence is a dict defining the following keys: * filenames: A set of filenames in package 1 (pid1) all referring to -- cgit v1.2.3 From 6a62d49350d44176de08afa980a47017007f4a3c Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 11 May 2014 15:25:46 +0200 Subject: webapp: allow git-like hash truncation --- webapp.py | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 2fd69bb..9612c38 100755 --- a/webapp.py +++ b/webapp.py @@ -8,7 +8,8 @@ from wsgiref.simple_server import make_server import jinja2 from werkzeug.exceptions import HTTPException, NotFound -from werkzeug.routing import Map, Rule, RequestRedirect +from werkzeug.routing import Map, Rule +from werkzeug.utils import redirect from werkzeug.wrappers import Request, Response from werkzeug.wsgi import SharedDataMiddleware @@ -61,6 +62,12 @@ def html_response(unicode_iterator, max_age=24 * 60 * 60): resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age) return resp +class InternalRedirect(Exception): + def __init__(self, target, code=301): + Exception.__init__(self) + self.target = target + self.code = code + class Application(object): def __init__(self, db): self.db = db @@ -84,17 +91,18 @@ class Application(object): elif endpoint == "hash": if args["function"] == "image_sha512": # backwards compatibility - raise RequestRedirect("%s/hash/png_sha512/%s" % - (request.environ["SCRIPT_NAME"], - args["hashvalue"])) + raise InternalRedirect("/hash/png_sha512/%s" % + args["hashvalue"]) return self.show_hash(args["function"], args["hashvalue"]) elif endpoint == "index": if not request.environ["PATH_INFO"]: - raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/") + raise InternalRedirect("/") return html_response(index_template.render(dict(urlroot=""))) elif endpoint == "source": return self.show_source(args["package"]) raise NotFound() + except InternalRedirect as r: + return redirect(request.environ["SCRIPT_NAME"] + r.target, r.code) except HTTPException as e: return e @@ -214,8 +222,16 @@ class Application(object): entries = [dict(package=package, filename=filename, size=size, function=otherfunc) for package, filename, size, otherfunc in fetchiter(cur)] - if not entries: - raise NotFound() + if not entries: + # Assumption: '~' serves as an infinite character larger than + # any other character in the hash column. + cur.execute("SELECT DISTINCT hash.hash FROM hash JOIN function ON hash.fid = function.id WHERE function.name = ? AND hash.hash >= ? AND hash.hash <= ? LIMIT 2;", + (function, hashvalue, hashvalue + '~')) + values = cur.fetchall() + if len(values) == 1: + raise InternalRedirect("/hash/%s/%s" % + (function, values[0][0]), 302) + raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, urlroot="../..") return html_response(hash_template.render(params)) -- cgit v1.2.3