diff options
author | Helmut Grohne <helmut@subdivi.de> | 2013-08-02 15:21:56 +0200 |
---|---|---|
committer | Helmut Grohne <helmut@subdivi.de> | 2013-08-02 15:21:56 +0200 |
commit | 7528af6d22d3967be9727f6e2d88dfcbf0f78ce9 (patch) | |
tree | 40f16ad103c89240cc68f6eaa35d278af42fb9ae | |
parent | 2712edb550968ce7ec8cd9800241d7944666631a (diff) | |
download | debian-dedup-7528af6d22d3967be9727f6e2d88dfcbf0f78ce9.tar.gz |
model comparability as an equivalence relation
webapp has had a relation hash_functions, that modeled "comparable
functions". Images should not be compares to other files, since it makes
no sense to store them as the RGBA stream, that is being hashed. This
comparability property resembles an equivalence relation. So the
function table gains a column eqclass. Each class is represented by a
number and functions are statically assigned to these classes. Now the
filtering happens in SQL instead of Python.
-rw-r--r-- | schema.sql | 4 | ||||
-rwxr-xr-x | webapp.py | 21 |
2 files changed, 6 insertions, 19 deletions
@@ -1,7 +1,7 @@ CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); -CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL); -INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512"); +CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER); +INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2); CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE INDEX content_package_size_index ON content (pid, size); @@ -12,16 +12,6 @@ from werkzeug.wsgi import SharedDataMiddleware from dedup.utils import fetchiter -hash_functions = [ - ("sha512", "sha512"), - ("png_sha512", "png_sha512"), - ("png_sha512", "gif_sha512"), - ("gif_sha512", "png_sha512"), - ("gif_sha512", "gif_sha512"), - ("gzip_sha512", "gzip_sha512"), - ("sha512", "gzip_sha512"), - ("gzip_sha512", "sha512")] - jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates")) def format_size(size): @@ -135,11 +125,9 @@ class Application(object): def cached_sharedstats(self, pid): cur = self.db.cursor() sharedstats = {} - cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ?;", + cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", (pid,)) for pid2, package2, func1, func2, files, size in fetchiter(cur): - if (func1, func2) not in hash_functions: - continue curstats = sharedstats.setdefault( function_combination(func1, func2), list()) if pid2 == pid: @@ -218,12 +206,11 @@ class Application(object): def show_hash(self, function, hashvalue): cur = self.db.cursor() - cur.execute("SELECT package.name, content.filename, content.size, function.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE hash = ?;", - (hashvalue,)) + cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", + (function, hashvalue,)) entries = [dict(package=package, filename=filename, size=size, function=otherfunc) - for package, filename, size, otherfunc in fetchiter(cur) - if (function, otherfunc) in hash_functions] + for package, filename, size, otherfunc in fetchiter(cur)] if not entries: raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, |