From 7528af6d22d3967be9727f6e2d88dfcbf0f78ce9 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Fri, 2 Aug 2013 15:21:56 +0200 Subject: model comparability as an equivalence relation webapp has had a relation hash_functions, that modeled "comparable functions". Images should not be compared to other files, since it makes no sense to store them as the RGBA stream, that is being hashed. This comparability property resembles an equivalence relation. So the function table gains a column eqclass. Each class is represented by a number and functions are statically assigned to these classes. Now the filtering happens in SQL instead of Python. --- webapp.py | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 260268a..f202c2e 100755 --- a/webapp.py +++ b/webapp.py @@ -12,16 +12,6 @@ from werkzeug.wsgi import SharedDataMiddleware from dedup.utils import fetchiter -hash_functions = [ - ("sha512", "sha512"), - ("png_sha512", "png_sha512"), - ("png_sha512", "gif_sha512"), - ("gif_sha512", "png_sha512"), - ("gif_sha512", "gif_sha512"), - ("gzip_sha512", "gzip_sha512"), - ("sha512", "gzip_sha512"), - ("gzip_sha512", "sha512")] - jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates")) def format_size(size): @@ -135,11 +125,9 @@ class Application(object): def cached_sharedstats(self, pid): cur = self.db.cursor() sharedstats = {} - cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ?;", + cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ?
AND f1.eqclass = f2.eqclass;", (pid,)) for pid2, package2, func1, func2, files, size in fetchiter(cur): - if (func1, func2) not in hash_functions: - continue curstats = sharedstats.setdefault( function_combination(func1, func2), list()) if pid2 == pid: @@ -218,12 +206,11 @@ class Application(object): def show_hash(self, function, hashvalue): cur = self.db.cursor() - cur.execute("SELECT package.name, content.filename, content.size, function.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE hash = ?;", - (hashvalue,)) + cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", + (function, hashvalue,)) entries = [dict(package=package, filename=filename, size=size, function=otherfunc) - for package, filename, size, otherfunc in fetchiter(cur) - if (function, otherfunc) in hash_functions] + for package, filename, size, otherfunc in fetchiter(cur)] if not entries: raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, -- cgit v1.2.3 From 022985f098a206c3b7852fe08a798cd31623f10d Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Mon, 2 Sep 2013 18:51:20 +0200 Subject: add option -d --database for db path to all scripts --- autoimport.py | 5 ++++- readyaml.py | 8 +++++++- update_sharing.py | 11 ++++++++--- webapp.py | 8 +++++++- 4 files changed, 26 insertions(+), 6 deletions(-) (limited to 'webapp.py') diff --git a/autoimport.py b/autoimport.py index a0681b3..d44c012 100755 --- a/autoimport.py +++ b/autoimport.py @@ -83,9 +83,12 @@ def main(): help="avoid reimporting same versions") parser.add_option("-p", "--prune", action="store_true", help="prune packages old packages") + parser.add_option("-d", "--database", action="store", + 
default="test.sqlite3", + help="path to the sqlite3 database file") options, args = parser.parse_args() tmpdir = tempfile.mkdtemp(prefix=b"debian-dedup") - db = sqlite3.connect("test.sqlite3") + db = sqlite3.connect(options.database) cur = db.cursor() cur.execute("PRAGMA foreign_keys = ON;") e = concurrent.futures.ThreadPoolExecutor(multiprocessing.cpu_count()) diff --git a/readyaml.py b/readyaml.py index 21b1ca1..2ef9a3b 100755 --- a/readyaml.py +++ b/readyaml.py @@ -2,6 +2,7 @@ """This tool reads a yaml file as generated by importpkg.py on stdin and updates the database with the contents.""" +import optparse import sqlite3 import sys @@ -53,7 +54,12 @@ def readyaml(db, stream): raise ValueError("missing commit block") def main(): - db = sqlite3.connect("test.sqlite3") + parser = optparse.OptionParser() + parser.add_option("-d", "--database", action="store", + default="test.sqlite3", + help="path to the sqlite3 database file") + options, args = parser.parse_args() + db = sqlite3.connect(options.database) readyaml(db, sys.stdin) if __name__ == "__main__": diff --git a/update_sharing.py b/update_sharing.py index 5ec6c7b..1ff0fd8 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -1,5 +1,6 @@ #!/usr/bin/python +import optparse import sqlite3 from dedup.utils import fetchiter @@ -37,8 +38,7 @@ def process_pkgdict(cursor, pkgdict): insert_key = (pid1, pid2, fid1, fid2) add_values(cursor, insert_key, pkgnumfiles, pkgsize) -def main(): - db = sqlite3.connect("test.sqlite3") +def main(db): cur = db.cursor() cur.execute("PRAGMA foreign_keys = ON;") cur.execute("DELETE FROM sharing;") @@ -61,4 +61,9 @@ def main(): db.commit() if __name__ == "__main__": - main() + parser = optparse.OptionParser() + parser.add_option("-d", "--database", action="store", + default="test.sqlite3", + help="path to the sqlite3 database file") + options, args = parser.parse_args() + main(sqlite3.connect(options.database)) diff --git a/webapp.py b/webapp.py index f202c2e..632b485 100755 --- 
a/webapp.py +++ b/webapp.py @@ -1,6 +1,7 @@ #!/usr/bin/python import datetime +import optparse import sqlite3 from wsgiref.simple_server import make_server @@ -237,7 +238,12 @@ class Application(object): return html_response(source_template.render(params)) def main(): - app = Application(sqlite3.connect("test.sqlite3")) + parser = optparse.OptionParser() + parser.add_option("-d", "--database", action="store", + default="test.sqlite3", + help="path to the sqlite3 database file") + options, args = parser.parse_args() + app = Application(sqlite3.connect(options.database)) app = SharedDataMiddleware(app, {"/": ("dedup", "static")}) make_server("0.0.0.0", 8800, app).serve_forever() -- cgit v1.2.3 From 49cac8bdae0ec787372d227545411ef14905d6a8 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 4 Sep 2013 10:15:59 +0200 Subject: webapp: serve static files from /static --- dedup/templates/base.html | 4 ++-- webapp.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'webapp.py') diff --git a/dedup/templates/base.html b/dedup/templates/base.html index 62f4087..9dfb788 100644 --- a/dedup/templates/base.html +++ b/dedup/templates/base.html @@ -3,8 +3,8 @@ {% block title %}{% endblock %} - - + + {% block header %}{% endblock %} diff --git a/webapp.py b/webapp.py index 632b485..d5f076e 100755 --- a/webapp.py +++ b/webapp.py @@ -244,7 +244,7 @@ def main(): help="path to the sqlite3 database file") options, args = parser.parse_args() app = Application(sqlite3.connect(options.database)) - app = SharedDataMiddleware(app, {"/": ("dedup", "static")}) + app = SharedDataMiddleware(app, {"/static": ("dedup", "static")}) make_server("0.0.0.0", 8800, app).serve_forever() if __name__ == "__main__": -- cgit v1.2.3 From ca65a78a9ace0aeb2565df0da171727c04c33970 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 10 Sep 2013 09:39:40 +0200 Subject: webapp: close database cursors Leaking them can result in running out of available filedescriptors. 
--- webapp.py | 98 ++++++++++++++++++++++++++++++++------------------------------- 1 file changed, 50 insertions(+), 48 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index d5f076e..4478ba0 100755 --- a/webapp.py +++ b/webapp.py @@ -1,5 +1,6 @@ #!/usr/bin/python +import contextlib import datetime import optparse import sqlite3 @@ -98,42 +99,43 @@ class Application(object): return e def get_details(self, package): - cur = self.db.cursor() - cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;", - (package,)) - row = cur.fetchone() - if not row: - raise NotFound() - pid, version, architecture = row - details = dict(pid=pid, - package=package, - version=version, - architecture=architecture) - cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;", - (pid,)) - num_files, total_size = cur.fetchone() + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;", + (package,)) + row = cur.fetchone() + if not row: + raise NotFound() + pid, version, architecture = row + details = dict(pid=pid, + package=package, + version=version, + architecture=architecture) + cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;", + (pid,)) + num_files, total_size = cur.fetchone() if total_size is None: total_size = 0 details.update(dict(num_files=num_files, total_size=total_size)) return details def get_dependencies(self, pid): - cur = self.db.cursor() - cur.execute("SELECT required FROM dependency WHERE pid = ?;", - (pid,)) - return set(row[0] for row in fetchiter(cur)) + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT required FROM dependency WHERE pid = ?;", + (pid,)) + return set(row[0] for row in fetchiter(cur)) def cached_sharedstats(self, pid): - cur = self.db.cursor() sharedstats = {} - cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = 
package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", - (pid,)) - for pid2, package2, func1, func2, files, size in fetchiter(cur): - curstats = sharedstats.setdefault( - function_combination(func1, func2), list()) - if pid2 == pid: - package2 = None - curstats.append(dict(package=package2, duplicate=files, savable=size)) + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", + (pid,)) + for pid2, package2, func1, func2, files, size in fetchiter(cur): + curstats = sharedstats.setdefault( + function_combination(func1, func2), list()) + if pid2 == pid: + package2 = None + curstats.append(dict(package=package2, duplicate=files, + savable=size)) return sharedstats def show_package(self, package): @@ -206,12 +208,12 @@ class Application(object): return html_response(detail_template.stream(params)) def show_hash(self, function, hashvalue): - cur = self.db.cursor() - cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", - (function, hashvalue,)) - entries = [dict(package=package, filename=filename, size=size, - function=otherfunc) - for package, filename, size, otherfunc in fetchiter(cur)] + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? 
AND hash = ?;", + (function, hashvalue,)) + entries = [dict(package=package, filename=filename, size=size, + function=otherfunc) + for package, filename, size, otherfunc in fetchiter(cur)] if not entries: raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, @@ -219,21 +221,21 @@ class Application(object): return html_response(hash_template.render(params)) def show_source(self, package): - cur = self.db.cursor() - cur.execute("SELECT name FROM package WHERE source = ?;", - (package,)) - binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) - if not binpkgs: - raise NotFound - cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", - (package,)) - for binary, otherbin, func1, func2, files, size in fetchiter(cur): - entry = dict(package=otherbin, - funccomb=function_combination(func1, func2), - duplicate=files, savable=size) - oldentry = binpkgs.get(binary) - if not (oldentry and oldentry["savable"] >= size): - binpkgs[binary] = entry + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT name FROM package WHERE source = ?;", + (package,)) + binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) + if not binpkgs: + raise NotFound + cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", + (package,)) + for binary, otherbin, func1, func2, files, size in fetchiter(cur): + entry = dict(package=otherbin, + funccomb=function_combination(func1, func2), + duplicate=files, savable=size) + oldentry = binpkgs.get(binary) + if not (oldentry and 
oldentry["savable"] >= size): + binpkgs[binary] = entry params = dict(source=package, packages=binpkgs, urlroot="..") return html_response(source_template.render(params)) -- cgit v1.2.3 From 786c4f93ea318a3c22479f80531594435fb036c3 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Wed, 11 Sep 2013 08:35:41 +0200 Subject: webapp: open cursors less often On the main instance opening cursors equals initiating a connection. Unfortunately sqlite3.Connection.close does not close filedescriptors. So just open fewer cursors to leak filedescriptors less often. --- webapp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 4478ba0..665ac23 100755 --- a/webapp.py +++ b/webapp.py @@ -166,6 +166,7 @@ class Application(object): cursize = -1 files = dict() minmatch = 2 if pid1 == pid2 else 1 + cur2 = self.db.cursor() for cid, filename, size, hashvalue in fetchiter(cur): if cursize != size: for entry in files.values(): @@ -181,13 +182,12 @@ class Application(object): entry = dict(filenames=set((filename,)), size=size, matches={}) files[hashvalue] = entry - cur2 = self.db.cursor() cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ?
AND pid = ?;", (cid, pid2)) for func1, hashvalue, func2, filename in fetchiter(cur2): entry["matches"].setdefault(filename, {})[func1, func2] = \ hashvalue - cur2.close() + cur2.close() cur.close() for entry in files.values(): -- cgit v1.2.3 From b38f14ab3fb72ca1578d7e6bb09178e6fbebba76 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 23 Feb 2014 15:44:03 +0100 Subject: webapp: fix eqclass usage in package comparison When comparing two packages, objects would be considered duplicates without considering whether the respective hash functions are comparable by checking their equivalence classes. The current set of hash functions does not expose this bug. --- webapp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'webapp.py') diff --git a/webapp.py b/webapp.py index 665ac23..fd6d685 100755 --- a/webapp.py +++ b/webapp.py @@ -182,7 +182,7 @@ class Application(object): entry = dict(filenames=set((filename,)), size=size, matches={}) files[hashvalue] = entry - cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;", + cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? 
AND fa.eqclass = fb.eqclass;", (cid, pid2)) for func1, hashvalue, func2, filename in fetchiter(cur2): entry["matches"].setdefault(filename, {})[func1, func2] = \ -- cgit v1.2.3 From 8d4c5512edbdcdd1063a7e6508f398a5a57981be Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 23 Feb 2014 18:19:35 +0100 Subject: spell check comments --- dedup/hashing.py | 2 +- dedup/image.py | 2 +- importpkg.py | 4 ++-- webapp.py | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'webapp.py') diff --git a/dedup/hashing.py b/dedup/hashing.py index 70f6268..a8a46c7 100644 --- a/dedup/hashing.py +++ b/dedup/hashing.py @@ -115,7 +115,7 @@ class DecompressedHash(object): class SuppressingHash(object): """A hash that silences exceptions from the update and hexdigest methods of - a hashlib-like object. If an exception has occured, hexdigest always + a hashlib-like object. If an exception has occurred, hexdigest always returns None.""" def __init__(self, hashobj, exceptions=()): """ diff --git a/dedup/image.py b/dedup/image.py index c1f2de0..ef17989 100644 --- a/dedup/image.py +++ b/dedup/image.py @@ -4,7 +4,7 @@ import struct import PIL.Image class ImageHash(object): - """A hash on the contents of an image datat type supported by PIL. This + """A hash on the contents of an image data type supported by PIL. This disregards mode, depth and meta information. Note that due to limitations in PIL and the image format (interlacing) the full contents are stored and decoded in hexdigest.""" diff --git a/importpkg.py b/importpkg.py index cb16f97..aeccda5 100755 --- a/importpkg.py +++ b/importpkg.py @@ -1,7 +1,7 @@ #!/usr/bin/python -"""This tool reads a debian package from stdin and emits a yaml stream on +"""This tool reads a Debian package from stdin and emits a yaml stream on stdout. It does not access a database. Therefore it can be run in parallel and -on multiple machines. The generated yaml conatins multiple documents. The first +on multiple machines. 
The generated yaml contains multiple documents. The first document contains package metadata. Then a document is emitted for each file. And finally a document consisting of the string "commit" is emitted.""" diff --git a/webapp.py b/webapp.py index fd6d685..2fd69bb 100755 --- a/webapp.py +++ b/webapp.py @@ -151,7 +151,7 @@ class Application(object): return html_response(package_template.render(params)) def compute_comparison(self, pid1, pid2): - """Compute a sequence of comparison objects ordery by the size of the + """Compute a sequence of comparison objects ordered by the size of the object in the first package. Each element of the sequence is a dict defining the following keys: * filenames: A set of filenames in package 1 (pid1) all referring to -- cgit v1.2.3