From 6206dea43941560a29c9a1105ae3055740ab80aa Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Mon, 22 Jul 2013 12:03:35 +0200 Subject: schema: extend content_package_index We can avoid a b-tree sort in the package comparison of the web app, if the package index, also provides a size. --- schema.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.sql b/schema.sql index b839a51..e9e0bcc 100644 --- a/schema.sql +++ b/schema.sql @@ -2,7 +2,7 @@ CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, ar CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); -CREATE INDEX content_package_index ON content (pid); +CREATE INDEX content_package_size_index ON content (pid, size); CREATE INDEX hash_cid_index ON hash (cid); CREATE INDEX hash_hash_index ON hash (hash); -- cgit v1.2.3 From 6f88561d726327c90f83b8aad1db26abbd4cdf1e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 23 Jul 2013 18:53:55 +0200 Subject: schema: reference hash functions by integer key This already worked quite well for package.id. On a test data set of 5% size this transformation reduces the database size by about 4%. --- readyaml.py | 2 +- schema.sql | 14 +++++++++++--- update_sharing.py | 16 ++++++++-------- webapp.py | 10 +++++----- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/readyaml.py b/readyaml.py index bb8ac54..f4d6ead 100755 --- a/readyaml.py +++ b/readyaml.py @@ -45,7 +45,7 @@ def readyaml(db, stream): cur.execute("INSERT INTO content (pid, filename, size) VALUES (?, ?, ?);", (pid, entry["name"], entry["size"])) cid = cur.lastrowid - cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);", + cur.executemany("INSERT INTO hash (cid, fid, hash) VALUES (?, (SELECT id FROM function WHERE name = ?), ?);", ((cid, func, hexhash) for func, hexhash in entry["hashes"].items())) raise ValueError("missing commit block") diff --git a/schema.sql b/schema.sql index e9e0bcc..cb6a2c5 100644 --- a/schema.sql +++ b/schema.sql @@ -1,11 +1,19 @@ CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); -CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); +CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL); +INSERT INTO function (name) VALUES ("sha512"), ("gzip_sha512"), ("image_sha512"); +CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE INDEX content_package_size_index ON content (pid, size); CREATE INDEX hash_cid_index ON hash (cid); CREATE INDEX hash_hash_index ON hash (hash); -CREATE TABLE sharing (pid1 INTEGER, pid2 INTEGER, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (pid1) REFERENCES package(id) ON DELETE CASCADE, FOREIGN KEY (pid2) REFERENCES package(id) ON DELETE CASCADE); -CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, func1, func2); +CREATE TABLE sharing ( + pid1 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + pid2 INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + fid1 INTEGER NOT NULL REFERENCES function(id), + fid2 INTEGER NOT NULL REFERENCES function(id), + files INTEGER, + size INTEGER); +CREATE INDEX sharing_insert_index ON sharing (pid1, pid2, fid1, fid2); CREATE TABLE duplicate (cid INTEGER PRIMARY KEY, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); diff --git a/update_sharing.py b/update_sharing.py index 55e8096..e1a2d68 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -5,23 +5,23 @@ import sqlite3 from dedup.utils import fetchiter def add_values(cursor, insert_key, files, size): - cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND func1 = ? AND func2 = ?;", + cursor.execute("UPDATE sharing SET files = files + ?, size = size + ? WHERE pid1 = ? AND pid2 = ? AND fid1 = ? AND fid2 = ?;", (files, size) + insert_key) if cursor.rowcount > 0: return - cursor.execute("INSERT INTO sharing (pid1, pid2, func1, func2, files, size) VALUES (?, ?, ?, ?, ?, ?);", + cursor.execute("INSERT INTO sharing (pid1, pid2, fid1, fid2, files, size) VALUES (?, ?, ?, ?, ?, ?);", insert_key + (files, size)) def compute_pkgdict(rows): pkgdict = dict() - for pid, _, filename, size, function in rows: + for pid, _, filename, size, fid in rows: funcdict = pkgdict.setdefault(pid, {}) - funcdict.setdefault(function, []).append((size, filename)) + funcdict.setdefault(fid, []).append((size, filename)) return pkgdict def process_pkgdict(cursor, pkgdict): for pid1, funcdict1 in pkgdict.items(): - for function1, files in funcdict1.items(): + for fid1, files in funcdict1.items(): numfiles = len(files) size = sum(entry[0] for entry in files) for pid2, funcdict2 in pkgdict.items(): @@ -33,8 +33,8 @@ def process_pkgdict(cursor, pkgdict): else: pkgnumfiles = numfiles pkgsize = size - for function2 in funcdict2.keys(): - insert_key = (pid1, pid2, function1, function2) + for fid2 in funcdict2.keys(): + insert_key = (pid1, pid2, fid1, fid2) add_values(cursor, insert_key, pkgnumfiles, pkgsize) def main(): @@ -46,7 +46,7 @@ def main(): readcur = db.cursor() readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") for hashvalue, in fetchiter(readcur): - cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;", + cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;", (hashvalue,)) rows = cur.fetchall() print("processing hash %s with %d entries" % (hashvalue, len(rows))) diff --git a/webapp.py b/webapp.py index 9e23128..d42e932 100755 --- a/webapp.py +++ b/webapp.py @@ -275,7 +275,7 @@ class Application(object): def cached_sharedstats(self, pid): cur = self.db.cursor() sharedstats = {} - cur.execute("SELECT pid2, package.name, func1, func2, files, size FROM sharing JOIN package ON sharing.pid2 = package.id WHERE pid1 = ?;", + cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ?;", (pid,)) for pid2, package2, func1, func2, files, size in fetchiter(cur): if (func1, func2) not in hash_functions: @@ -305,7 +305,7 @@ class Application(object): from hash function pairs to hash values. """ cur = self.db.cursor() - cur.execute("SELECT id, filename, size, hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid WHERE pid = ? AND function = 'sha512' ORDER BY size DESC;", + cur.execute("SELECT content.id, content.filename, content.size, hash.hash FROM content JOIN hash ON content.id = hash.cid JOIN duplicate ON content.id = duplicate.cid JOIN function ON hash.fid = function.id WHERE pid = ? AND function.name = 'sha512' ORDER BY size DESC;", (pid1,)) cursize = -1 files = dict() @@ -326,7 +326,7 @@ class Application(object): files[hashvalue] = entry cur2 = self.db.cursor() - cur2.execute("SELECT ha.function, ha.hash, hb.function, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id WHERE ha.cid = ? AND pid = ?;", + cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;", (cid, pid2)) for func1, hashvalue, func2, filename in fetchiter(cur2): entry["matches"].setdefault(filename, {})[func1, func2] = \ @@ -353,7 +353,7 @@ class Application(object): def show_hash(self, function, hashvalue): cur = self.db.cursor() - cur.execute("SELECT package.name, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id WHERE hash = ?;", + cur.execute("SELECT package.name, content.filename, content.size, function.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE hash = ?;", (hashvalue,)) entries = [dict(package=package, filename=filename, size=size, function=otherfunc) @@ -372,7 +372,7 @@ class Application(object): binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) if not binpkgs: raise NotFound - cur.execute("SELECT p1.name, p2.name, sharing.func1, sharing.func2, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id WHERE p1.source = ?;", + cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", (package,)) for binary, otherbin, func1, func2, files, size in fetchiter(cur): entry = dict(package=otherbin, -- cgit v1.2.3 From 2f797c9b90f05eadf4bb13f4a9c1f029925d9275 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 23 Jul 2013 21:54:41 +0200 Subject: adapt queries in README to new schema --- README | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README b/README index 44b086a..b0e06f3 100644 --- a/README +++ b/README @@ -43,12 +43,12 @@ Finding the 100 largest files shared with multiple packages. Finding those top 100 files that save most space when being reduced to only one copy in the archive. - SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100; + SELECT hash, sum(size)-min(size), count(*), count(distinct pid) FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100; Finding PNG images that do not carry a .png file extension. - SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id WHERE function = "image_sha512" AND filename NOT LIKE "%.png"; + SELECT package.name, content.filename, content.size FROM content JOIN hash ON content.id = hash.cid JOIN package ON content.pid = package.id JOIN function ON hash.fid = function.id WHERE function.name = "image_sha512" AND filename NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. - SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash WHERE hash.cid = content.id AND hash.function = "gzip_sha512") = 0; + SELECT package.name, content.filename FROM content JOIN package ON content.pid = package.id WHERE filename LIKE "%.gz" AND (SELECT count(*) FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = "gzip_sha512") = 0; -- cgit v1.2.3 From da33f7f323fa9431809e4d93a546e3b234d50406 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 23 Jul 2013 23:26:28 +0200 Subject: webapp: remove unused function --- webapp.py | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/webapp.py b/webapp.py index 9e23128..6def0b6 100755 --- a/webapp.py +++ b/webapp.py @@ -193,27 +193,6 @@ def html_response(unicode_iterator, max_age=24 * 60 * 60): resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age) return resp -def generate_shared(rows): - """internal helper from show_detail""" - entry = None - for filename1, size1, func1, filename2, size2, func2, hashvalue in rows: - funccomb = (func1, func2) - if funccomb not in hash_functions: - continue - if entry and (entry["filename1"] != filename1 or - entry["filename2"] != filename2): - yield entry - entry = None - if entry: - funcdict = entry["functions"] - else: - funcdict = dict() - entry = dict(filename1=filename1, filename2=filename2, size1=size1, - size2=size2, functions=funcdict) - funcdict[funccomb] = hashvalue - if entry: - yield entry - class Application(object): def __init__(self, db): self.db = db -- cgit v1.2.3 From 1e50900862fe8887755597d85483dbc845ccb5e3 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 23 Jul 2013 23:26:52 +0200 Subject: README: fix typo in query --- README | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README b/README index 44b086a..a023d0a 100644 --- a/README +++ b/README @@ -38,7 +38,7 @@ SQL database by hand. Here are some example queries. Finding the 100 largest files shared with multiple packages. - SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON b.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; + SELECT pa.name, a.filename, pb.name, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid JOIN package AS pa ON a.pid = pa.id JOIN package AS pb ON b.pid = pb.id WHERE (a.pid != b.pid OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; Finding those top 100 files that save most space when being reduced to only one copy in the archive. -- cgit v1.2.3 From 9e37415086e64d8f623f8236af83a767648611bc Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Tue, 23 Jul 2013 23:32:00 +0200 Subject: webapp: make html for index valid --- webapp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/webapp.py b/webapp.py index 6def0b6..b5e0c63 100755 --- a/webapp.py +++ b/webapp.py @@ -151,7 +151,7 @@ index_template = jinjaenv.from_string( Enter binary package to inspect - Note: Non-existing packages will result in 404-Errors
-