summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHelmut Grohne <helmut@subdivi.de>2013-08-02 15:21:56 +0200
committerHelmut Grohne <helmut@subdivi.de>2013-08-02 15:21:56 +0200
commit7528af6d22d3967be9727f6e2d88dfcbf0f78ce9 (patch)
tree40f16ad103c89240cc68f6eaa35d278af42fb9ae
parent2712edb550968ce7ec8cd9800241d7944666631a (diff)
downloaddebian-dedup-7528af6d22d3967be9727f6e2d88dfcbf0f78ce9.tar.gz
model comparability as an equivalence relation
webapp had a relation hash_functions that modeled "comparable functions". Images should not be compared to other files, since it makes no sense to store them as the RGBA stream that is being hashed. This comparability property resembles an equivalence relation, so the function table gains a column eqclass. Each class is represented by a number and functions are statically assigned to these classes. Now the filtering happens in SQL instead of Python.
-rw-r--r--schema.sql4
-rwxr-xr-xwebapp.py21
2 files changed, 6 insertions, 19 deletions
diff --git a/schema.sql b/schema.sql
index ddc6ccd..2ab7ca7 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,7 +1,7 @@
CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
-CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL);
-INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("png_sha512"), ("gif_sha512");
+CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER);
+INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2);
CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id));
CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
CREATE INDEX content_package_size_index ON content (pid, size);
diff --git a/webapp.py b/webapp.py
index 260268a..f202c2e 100755
--- a/webapp.py
+++ b/webapp.py
@@ -12,16 +12,6 @@ from werkzeug.wsgi import SharedDataMiddleware
from dedup.utils import fetchiter
-hash_functions = [
- ("sha512", "sha512"),
- ("png_sha512", "png_sha512"),
- ("png_sha512", "gif_sha512"),
- ("gif_sha512", "png_sha512"),
- ("gif_sha512", "gif_sha512"),
- ("gzip_sha512", "gzip_sha512"),
- ("sha512", "gzip_sha512"),
- ("gzip_sha512", "sha512")]
-
jinjaenv = jinja2.Environment(loader=jinja2.PackageLoader("dedup", "templates"))
def format_size(size):
@@ -135,11 +125,9 @@ class Application(object):
def cached_sharedstats(self, pid):
cur = self.db.cursor()
sharedstats = {}
- cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ?;",
+ cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
(pid,))
for pid2, package2, func1, func2, files, size in fetchiter(cur):
- if (func1, func2) not in hash_functions:
- continue
curstats = sharedstats.setdefault(
function_combination(func1, func2), list())
if pid2 == pid:
@@ -218,12 +206,11 @@ class Application(object):
def show_hash(self, function, hashvalue):
cur = self.db.cursor()
- cur.execute("SELECT package.name, content.filename, content.size, function.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE hash = ?;",
- (hashvalue,))
+ cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
+ (function, hashvalue,))
entries = [dict(package=package, filename=filename, size=size,
function=otherfunc)
- for package, filename, size, otherfunc in fetchiter(cur)
- if (function, otherfunc) in hash_functions]
+ for package, filename, size, otherfunc in fetchiter(cur)]
if not entries:
raise NotFound()
params = dict(function=function, hashvalue=hashvalue, entries=entries,