From b3d4bbae6fa2810daf3481d8ba99675ffe3bce9e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 7 Mar 2013 08:24:44 +0100 Subject: integrate the source table into the package table --- importpkg.py | 8 ++------ schema.sql | 3 +-- webapp.py | 4 ++-- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/importpkg.py b/importpkg.py index d63b85e..ef368c2 100755 --- a/importpkg.py +++ b/importpkg.py @@ -148,8 +148,8 @@ def process_package(db, filelike): (package,)) cur.execute("DELETE FROM content WHERE package = ?;", (package,)) - cur.execute("INSERT INTO package (package, version, architecture) VALUES (?, ?, ?);", - (package, version, architecture)) + cur.execute("INSERT INTO package (package, version, architecture, source) VALUES (?, ?, ?, ?);", + (package, version, architecture, source)) depends = control.relations.get("depends", []) depends = set(dep[0]["name"].encode("ascii") for dep in depends if len(dep) == 1) @@ -157,10 +157,6 @@ def process_package(db, filelike): (package,)) cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);", ((package, dep) for dep in depends)) - cur.execute("DELETE FROM source WHERE package = ?;", - (package,)) - cur.execute("INSERT INTO source (source, package) VALUES (?, ?);", - (source, package)) break continue elif name == "data.tar.gz": diff --git a/schema.sql b/schema.sql index 878a5fe..954b359 100644 --- a/schema.sql +++ b/schema.sql @@ -1,7 +1,6 @@ -CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT); +CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package)); CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package), FOREIGN KEY (required) REFERENCES package(package)); -CREATE TABLE source (source TEXT, package TEXT, FOREIGN KEY (package) REFERENCES package(package)); CREATE INDEX content_package_index ON content (package); CREATE INDEX content_hash_index ON content (hash); diff --git a/webapp.py b/webapp.py index 6c06e28..0e4dc15 100755 --- a/webapp.py +++ b/webapp.py @@ -334,12 +334,12 @@ class Application(object): def show_source(self, package): cur = self.db.cursor() - cur.execute("SELECT package FROM source WHERE source = ?;", + cur.execute("SELECT package FROM package WHERE source = ?;", (package,)) binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) if not binpkgs: raise NotFound - cur.execute("SELECT source.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM source JOIN sharing ON source.package = sharing.package1 WHERE source.source = ?;", + cur.execute("SELECT package.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM package JOIN sharing ON package.package = sharing.package1 WHERE package.source = ?;", (package,)) for binary, otherbin, func1, func2, files, size in fetchiter(cur): entry = dict(package=otherbin, -- cgit v1.2.3 From 827daf10eb510a00f9ca710a001f962524686520 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 7 Mar 2013 08:28:56 +0100 Subject: schema.sql: annotat foreign keys of sharing --- schema.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.sql b/schema.sql index 954b359..d0a27cf 100644 --- a/schema.sql +++ b/schema.sql @@ -4,6 +4,6 @@ CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFE CREATE INDEX content_package_index ON content (package); CREATE INDEX content_hash_index ON content (hash); -CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER); +CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (package1) REFERENCES package(package), FOREIGN KEY (package2) REFERENCES package(package)); CREATE INDEX sharing_insert_index ON sharing (package1, package2, func1, func2); CREATE INDEX sharing_package_index ON sharing (package1); -- cgit v1.2.3 From d6638c18872db382a8d1045b6de1f9438c8a25fa Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 7 Mar 2013 08:41:35 +0100 Subject: schema.sql: remove unsatisfiable foreign key In the dependency table we will insert dependencies on packages which are not tracked. This happens during initial import and for virtual packages. Therefore the "required" column cannot be a foreign key. --- schema.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/schema.sql b/schema.sql index d0a27cf..72357e3 100644 --- a/schema.sql +++ b/schema.sql @@ -1,6 +1,6 @@ CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT); CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package)); -CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package), FOREIGN KEY (required) REFERENCES package(package)); +CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package)); CREATE INDEX content_package_index ON content (package); CREATE INDEX content_hash_index ON content (hash); -- cgit v1.2.3 From ac4c52c7a4501e90ed53f3ca5780bec4551a8c9f Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 7 Mar 2013 08:43:15 +0100 Subject: enable enforcing foreign keys --- autoimport.py | 6 +++++- importpkg.py | 5 ++--- update_sharing.py | 1 + 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/autoimport.py b/autoimport.py index a7ea5ff..bc2c6aa 100755 --- a/autoimport.py +++ b/autoimport.py @@ -14,6 +14,7 @@ def main(): urlbase = sys.argv[1] db = sqlite3.connect("test.sqlite3") cur = db.cursor() + cur.execute("PRAGMA foreign_keys = ON;") cur.execute("SELECT package, version FROM package;") knownpkgs = dict((row[0], row[1]) for row in cur.fetchall()) @@ -37,7 +38,10 @@ def main(): delpkgs = set(knownpkgs) - distpkgs print("clearing packages %s" % " ".join(delpkgs)) - cur.execute("PRAGMA foreign_keys=1;") + cur.executemany("DELETE FROM sharing WHERE package1 = ?", + ((pkg,) for pkg in delpkgs)) + cur.executemany("DELETE FROM sharing WHERE package2 = ?", + ((pkg,) for pkg in delpkgs)) cur.executemany("DELETE FROM content WHERE package = ?;", ((pkg,) for pkg in delpkgs)) cur.executemany("DELETE FROM dependency WHERE package = ?;", diff --git a/importpkg.py b/importpkg.py index ef368c2..aae9a7f 100755 --- a/importpkg.py +++ b/importpkg.py @@ -109,6 +109,7 @@ def get_hashes(tar): def process_package(db, filelike): cur = db.cursor() + cur.execute("PRAGMA foreign_keys = ON;") af = ArReader(filelike) af.read_magic() state = "start" @@ -144,11 +145,9 @@ def process_package(db, filelike): if row and version_compare(row[0], version) > 0: return # already seen a newer package - cur.execute("DELETE FROM package WHERE package = ?;", - (package,)) cur.execute("DELETE FROM content WHERE package = ?;", (package,)) - cur.execute("INSERT INTO package (package, version, architecture, source) VALUES (?, ?, ?, ?);", + cur.execute("INSERT OR REPLACE INTO package (package, version, architecture, source) VALUES (?, ?, ?, ?);", (package, version, architecture, source)) depends = control.relations.get("depends", []) depends = set(dep[0]["name"].encode("ascii") diff --git a/update_sharing.py b/update_sharing.py index dd12e5b..2ed532b 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -40,6 +40,7 @@ def process_pkgdict(cursor, pkgdict): def main(): db = sqlite3.connect("test.sqlite3") cur = db.cursor() + cur.execute("PRAGMA foreign_keys = ON;") cur.execute("DELETE FROM sharing;") readcur = db.cursor() readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;") -- cgit v1.2.3 From e0dd701f41a874c1c06702d849fe4b860a450e3e Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Thu, 7 Mar 2013 09:05:48 +0100 Subject: use "ON DELETE CASCADE" clauses --- autoimport.py | 10 ++-------- schema.sql | 6 +++--- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/autoimport.py b/autoimport.py index bc2c6aa..453a839 100755 --- a/autoimport.py +++ b/autoimport.py @@ -38,16 +38,10 @@ def main(): delpkgs = set(knownpkgs) - distpkgs print("clearing packages %s" % " ".join(delpkgs)) - cur.executemany("DELETE FROM sharing WHERE package1 = ?", - ((pkg,) for pkg in delpkgs)) - cur.executemany("DELETE FROM sharing WHERE package2 = ?", - ((pkg,) for pkg in delpkgs)) - cur.executemany("DELETE FROM content WHERE package = ?;", - ((pkg,) for pkg in delpkgs)) - cur.executemany("DELETE FROM dependency WHERE package = ?;", - ((pkg,) for pkg in delpkgs)) cur.executemany("DELETE FROM package WHERE package = ?;", ((pkg,) for pkg in delpkgs)) + # Tables content, dependency and sharing will also be pruned + # due to ON DELETE CASCADE clauses. db.commit() if __name__ == "__main__": diff --git a/schema.sql b/schema.sql index 72357e3..c329465 100644 --- a/schema.sql +++ b/schema.sql @@ -1,9 +1,9 @@ CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT); -CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package)); -CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package)); +CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE); +CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE); CREATE INDEX content_package_index ON content (package); CREATE INDEX content_hash_index ON content (hash); -CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (package1) REFERENCES package(package), FOREIGN KEY (package2) REFERENCES package(package)); +CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (package1) REFERENCES package(package) ON DELETE CASCADE, FOREIGN KEY (package2) REFERENCES package(package) ON DELETE CASCADE); CREATE INDEX sharing_insert_index ON sharing (package1, package2, func1, func2); CREATE INDEX sharing_package_index ON sharing (package1); -- cgit v1.2.3 From 423ceee0d0befc8755a9ae915d15e8d415d98159 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 9 Mar 2013 18:37:24 +0100 Subject: webapp: drop unused function compute_sharedstats The sharing table works great and I don't want to adapt it for the next step in the schema change. --- webapp.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/webapp.py b/webapp.py index 0e4dc15..f80b3da 100755 --- a/webapp.py +++ b/webapp.py @@ -253,30 +253,6 @@ class Application(object): (package,)) return set(row[0] for row in fetchiter(cur)) - def compute_sharedstats(self, package): - cur = self.db.cursor() - sharedstats = {} - for func1, func2 in hash_functions: - cur.execute("SELECT a.filename, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND a.function = ? AND b.function = ? AND (a.filename != b.filename OR b.package != ?);", - (package, func1, func2, package)) - sharing = dict() - for afile, hashval, size, bpkg in fetchiter(cur): - hashdict = sharing.setdefault(bpkg, dict()) - fileset = hashdict.setdefault(hashval, (size, set()))[1] - fileset.add(afile) - if sharing: - sharedstats[function_combination(func1, func2)] = curstats = [] - mapping = sharing.pop(package, dict()) - if mapping: - duplicate = sum(len(files) for _, files in mapping.values()) - savable = sum(size * (len(files) - 1) for size, files in mapping.values()) - curstats.append(dict(package=None, duplicate=duplicate, savable=savable)) - for pkg, mapping in sharing.items(): - duplicate = sum(len(files) for _, files in mapping.values()) - savable = sum(size * len(files) for size, files in mapping.values()) - curstats.append(dict(package=pkg, duplicate=duplicate, savable=savable)) - return sharedstats - def cached_sharedstats(self, package): cur = self.db.cursor() sharedstats = {} -- cgit v1.2.3 From 5c0dcba3229b8c3e0faf42cf3e07cb82ee1369cd Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 9 Mar 2013 18:43:47 +0100 Subject: split content table to a hash table In the old content table (package, filename, size) would be the same for multiple hash functions. Now the schema represents that each file has precisely one size, but multiple hashes. --- importpkg.py | 13 +++++++++---- schema.sql | 6 ++++-- update_sharing.py | 4 ++-- webapp.py | 6 +++--- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/importpkg.py b/importpkg.py index aae9a7f..5d6a58c 100755 --- a/importpkg.py +++ b/importpkg.py @@ -102,10 +102,12 @@ def get_hashes(tar): continue hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash()) hasher = hash_file(hasher, tar.extractfile(elem)) + hashes = {} for hashobj in hasher.hashes: hashvalue = hashobj.hexdigest() if hashvalue: - yield (elem.name, elem.size, hashobj.name, hashvalue) + hashes[hashobj.name] = hashvalue + yield (elem.name, elem.size, hashes) def process_package(db, filelike): cur = db.cursor() @@ -169,14 +171,17 @@ def process_package(db, filelike): continue if state != "control_file": raise ValueError("missing control file") - for name, size, function, hexhash in get_hashes(tf): + for name, size, hashes in get_hashes(tf): try: name = name.decode("utf8") except UnicodeDecodeError: print("warning: skipping filename with encoding error") continue # skip files with non-utf8 encoding for now - cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);", - (package, name, size, function, hexhash)) + cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);", + (package, name, size)) + cid = cur.lastrowid + cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);", + ((cid, func, hexhash) for func, hexhash in hashes.items())) db.commit() return raise ValueError("data.tar not found") diff --git a/schema.sql b/schema.sql index c329465..a67c807 100644 --- a/schema.sql +++ b/schema.sql @@ -1,8 +1,10 @@ CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT); -CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE); +CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE); +CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE); CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE); CREATE INDEX content_package_index ON content (package); -CREATE INDEX content_hash_index ON content (hash); +CREATE INDEX hash_cid_index ON hash (cid); +CREATE INDEX hash_hash_index ON hash (hash); CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (package1) REFERENCES package(package) ON DELETE CASCADE, FOREIGN KEY (package2) REFERENCES package(package) ON DELETE CASCADE); CREATE INDEX sharing_insert_index ON sharing (package1, package2, func1, func2); diff --git a/update_sharing.py b/update_sharing.py index 2ed532b..b45e40b 100755 --- a/update_sharing.py +++ b/update_sharing.py @@ -43,9 +43,9 @@ def main(): cur.execute("PRAGMA foreign_keys = ON;") cur.execute("DELETE FROM sharing;") readcur = db.cursor() - readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;") + readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;") for hashvalue, in fetchiter(readcur): - cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;", + cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;", (hashvalue,)) rows = cur.fetchall() print("processing hash %s with %d entries" % (hashvalue, len(rows))) diff --git a/webapp.py b/webapp.py index f80b3da..1da987b 100755 --- a/webapp.py +++ b/webapp.py @@ -279,13 +279,13 @@ class Application(object): if package1 == package2: details1 = details2 = self.get_details(package1) - cur.execute("SELECT a.filename, a.size, a.function, b.filename, b.size, b.function, a.hash FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND b.package = ? AND a.filename != b.filename ORDER BY a.size DESC, a.filename, b.filename;", + cur.execute("SELECT a.filename, a.size, ha.function, b.filename, b.size, hb.function, ha.hash FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE a.package = ? AND b.package = ? AND a.filename != b.filename ORDER BY a.size DESC, a.filename, b.filename;", (package1, package1)) else: details1 = self.get_details(package1) details2 = self.get_details(package2) - cur.execute("SELECT a.filename, a.size, a.function, b.filename, b.size, b.function, a.hash FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND b.package = ? ORDER BY a.size DESC, a.filename, b.filename;", + cur.execute("SELECT a.filename, a.size, ha.function, b.filename, b.size, hb.function, ha.hash FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE a.package = ? AND b.package = ? ORDER BY a.size DESC, a.filename, b.filename;", (package1, package2)) shared = generate_shared(fetchiter(cur)) # The cursor will be in use until the template is fully rendered. @@ -297,7 +297,7 @@ class Application(object): def show_hash(self, function, hashvalue): cur = self.db.cursor() - cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;", + cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;", (hashvalue,)) entries = [dict(package=package, filename=filename, size=size, function=otherfunc) -- cgit v1.2.3 From 4326c4766a6e1fb30bdb6999922ad0620543fff0 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 10 Mar 2013 07:38:22 +0100 Subject: README: update queries to match content table split --- README | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README b/README index bc1b715..55a8eb6 100644 --- a/README +++ b/README @@ -30,17 +30,17 @@ SQL database by hand. Here are some example queries. Finding the 100 largest files shared with multiple packages. - SELECT a.package, a.filename, b.package, b.filename, a.size FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE (a.package != b.package OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; + SELECT a.package, a.filename, b.package, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE (a.package != b.package OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100; Finding those top 100 files that save most space when being reduced to only one copy in the archive. - SELECT hash, sum(size)-min(size), count(*), count(distinct package) FROM content GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100; + SELECT hash, sum(size)-min(size), count(*), count(distinct package) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100; Finding PNG images that do not carry a .png file extension. - SELECT package, filename, size FROM content WHERE function = "image_sha512" AND filename NOT LIKE "%.png"; + SELECT package, filename, size FROM content JOIN hash ON content.id = hash.cid WHERE function = "image_sha512" AND filename NOT LIKE "%.png"; Finding .gz files which either are not gziped or contain errors. - SELECT package, filename FROM content AS a WHERE function = "sha512" AND filename LIKE "%.gz" AND (SELECT count(*) FROM content AS b WHERE b.package = a.package AND b.filename = a.filename AND b.function = "gzip_sha512") = 0; + SELECT a.package, a.filename FROM content AS a JOIN hash AS ha ON a.id = ha.cid WHERE ha.function = "sha512" AND a.filename LIKE "%.gz" AND (SELECT count(*) FROM content AS b JOIN hash AS hb ON b.id = hb.cid WHERE b.package = a.package AND b.filename = a.filename AND hb.function = "gzip_sha512") = 0; -- cgit v1.2.3