author     Helmut Grohne <helmut@subdivi.de>    2013-03-26 15:59:48 +0100
committer  Helmut Grohne <helmut@subdivi.de>    2013-03-26 15:59:48 +0100
commit     88b0d1ed0809922b56f30e1ee99bc6db67af7f96
tree       b736dde6602dd0006cbb0efe490d28c42cb2dc3d
parent     68cf124ab5fba5f96364925e09226db5c2bbf028
parent     4326c4766a6e1fb30bdb6999922ad0620543fff0
download   debian-dedup-88b0d1ed0809922b56f30e1ee99bc6db67af7f96.tar.gz
Merge branch schemachange
-rw-r--r--   README             |  8
-rwxr-xr-x   autoimport.py      |  8
-rwxr-xr-x   importpkg.py       | 24
-rw-r--r--   schema.sql         | 13
-rwxr-xr-x   update_sharing.py  |  5
-rwxr-xr-x   webapp.py          | 34
6 files changed, 34 insertions, 58 deletions
diff --git a/README b/README
--- a/README
+++ b/README
@@ -30,17 +30,17 @@ SQL database by hand. Here are some example queries.
 
 Finding the 100 largest files shared with multiple packages.
 
-    SELECT a.package, a.filename, b.package, b.filename, a.size FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE (a.package != b.package OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
+    SELECT a.package, a.filename, b.package, b.filename, a.size FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE (a.package != b.package OR a.filename != b.filename) ORDER BY a.size DESC LIMIT 100;
 
 Finding those top 100 files that save most space when being reduced to only
 one copy in the archive.
 
-    SELECT hash, sum(size)-min(size), count(*), count(distinct package) FROM content GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
+    SELECT hash, sum(size)-min(size), count(*), count(distinct package) FROM content JOIN hash ON content.id = hash.cid WHERE hash.function = "sha512" GROUP BY hash ORDER BY sum(size)-min(size) DESC LIMIT 100;
 
 Finding PNG images that do not carry a .png file extension.
 
-    SELECT package, filename, size FROM content WHERE function = "image_sha512" AND filename NOT LIKE "%.png";
+    SELECT package, filename, size FROM content JOIN hash ON content.id = hash.cid WHERE function = "image_sha512" AND filename NOT LIKE "%.png";
 
 Finding .gz files which either are not gziped or contain errors.
 
-    SELECT package, filename FROM content AS a WHERE function = "sha512" AND filename LIKE "%.gz" AND (SELECT count(*) FROM content AS b WHERE b.package = a.package AND b.filename = a.filename AND b.function = "gzip_sha512") = 0;
+    SELECT a.package, a.filename FROM content AS a JOIN hash AS ha ON a.id = ha.cid WHERE ha.function = "sha512" AND a.filename LIKE "%.gz" AND (SELECT count(*) FROM content AS b JOIN hash AS hb ON b.id = hb.cid WHERE b.package = a.package AND b.filename = a.filename AND hb.function = "gzip_sha512") = 0;
diff --git a/autoimport.py b/autoimport.py
index a7ea5ff..453a839 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -14,6 +14,7 @@ def main():
     urlbase = sys.argv[1]
     db = sqlite3.connect("test.sqlite3")
     cur = db.cursor()
+    cur.execute("PRAGMA foreign_keys = ON;")
     cur.execute("SELECT package, version FROM package;")
     knownpkgs = dict((row[0], row[1]) for row in cur.fetchall())
 
@@ -37,13 +38,10 @@ def main():
 
     delpkgs = set(knownpkgs) - distpkgs
     print("clearing packages %s" % " ".join(delpkgs))
-    cur.execute("PRAGMA foreign_keys=1;")
-    cur.executemany("DELETE FROM content WHERE package = ?;",
-                    ((pkg,) for pkg in delpkgs))
-    cur.executemany("DELETE FROM dependency WHERE package = ?;",
-                    ((pkg,) for pkg in delpkgs))
     cur.executemany("DELETE FROM package WHERE package = ?;",
                     ((pkg,) for pkg in delpkgs))
+    # Tables content, dependency and sharing will also be pruned
+    # due to ON DELETE CASCADE clauses.
     db.commit()
 
 if __name__ == "__main__":
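The autoimport.py change above replaces the manual cleanup of the child tables with a single DELETE on package and relies on the ON DELETE CASCADE clauses introduced in schema.sql further down. SQLite leaves foreign key enforcement off by default and the pragma only applies to the current connection, which is why every script now issues PRAGMA foreign_keys = ON itself. A minimal sketch of that behaviour, reusing the table definitions from schema.sql with made-up demo values:

    # Sketch only, not part of the patch: the cascade fires solely because the
    # pragma is enabled on this connection; without it the content row stays.
    import sqlite3

    db = sqlite3.connect(":memory:")
    cur = db.cursor()
    cur.execute("PRAGMA foreign_keys = ON;")  # must be issued per connection
    cur.execute("CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT);")
    cur.execute("CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);")
    cur.execute("INSERT INTO package VALUES ('demo', '1.0', 'all', 'demo');")
    cur.execute("INSERT INTO content (package, filename, size) VALUES ('demo', './usr/share/doc/demo/changelog.gz', 1234);")
    cur.execute("DELETE FROM package WHERE package = 'demo';")
    print(cur.execute("SELECT count(*) FROM content;").fetchone()[0])  # prints 0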
diff --git a/importpkg.py b/importpkg.py
index c6ce7f9..e0160e6 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -50,13 +50,16 @@ def get_hashes(tar):
             continue
         hasher = MultiHash(sha512_nontrivial(), gziphash(), imagehash())
         hasher = hash_file(hasher, tar.extractfile(elem))
+        hashes = {}
         for hashobj in hasher.hashes:
             hashvalue = hashobj.hexdigest()
             if hashvalue:
-                yield (elem.name, elem.size, hashobj.name, hashvalue)
+                hashes[hashobj.name] = hashvalue
+        yield (elem.name, elem.size, hashes)
 
 def process_package(db, filelike):
     cur = db.cursor()
+    cur.execute("PRAGMA foreign_keys = ON;")
     af = ArReader(filelike)
     af.read_magic()
     state = "start"
@@ -92,12 +95,10 @@ def process_package(db, filelike):
                 if row and version_compare(row[0], version) > 0:
                     return # already seen a newer package
 
-                cur.execute("DELETE FROM package WHERE package = ?;",
-                            (package,))
                 cur.execute("DELETE FROM content WHERE package = ?;",
                             (package,))
-                cur.execute("INSERT INTO package (package, version, architecture) VALUES (?, ?, ?);",
-                            (package, version, architecture))
+                cur.execute("INSERT OR REPLACE INTO package (package, version, architecture, source) VALUES (?, ?, ?, ?);",
+                            (package, version, architecture, source))
                 depends = control.relations.get("depends", [])
                 depends = set(dep[0]["name"].encode("ascii")
                               for dep in depends if len(dep) == 1)
@@ -105,10 +106,6 @@ def process_package(db, filelike):
                             (package,))
                 cur.executemany("INSERT INTO dependency (package, required) VALUES (?, ?);",
                                 ((package, dep) for dep in depends))
-                cur.execute("DELETE FROM source WHERE package = ?;",
-                            (package,))
-                cur.execute("INSERT INTO source (source, package) VALUES (?, ?);",
-                            (source, package))
                 break
             continue
         elif name == "data.tar.gz":
@@ -122,14 +119,17 @@ def process_package(db, filelike):
             continue
         if state != "control_file":
             raise ValueError("missing control file")
-        for name, size, function, hexhash in get_hashes(tf):
+        for name, size, hashes in get_hashes(tf):
             try:
                 name = name.decode("utf8")
             except UnicodeDecodeError:
                 print("warning: skipping filename with encoding error")
                 continue # skip files with non-utf8 encoding for now
-            cur.execute("INSERT INTO content (package, filename, size, function, hash) VALUES (?, ?, ?, ?, ?);",
-                        (package, name, size, function, hexhash))
+            cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);",
+                        (package, name, size))
+            cid = cur.lastrowid
+            cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
+                            ((cid, func, hexhash) for func, hexhash in hashes.items()))
         db.commit()
         return
     raise ValueError("data.tar not found")
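With the hash columns split out of content, importpkg.py now inserts one content row per file and then one hash row per hash function, linking them through the rowid of the content row. A stripped-down sketch of that pattern against the new schema; the package name, filename and hash values below are placeholders:

    # Sketch only: the two-step insert that replaces the old one-row-per-hash
    # content table. cur.lastrowid is the INTEGER PRIMARY KEY of the content
    # row just inserted and becomes the cid of its hash rows.
    import sqlite3

    db = sqlite3.connect(":memory:")
    cur = db.cursor()
    cur.execute("CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER);")
    cur.execute("CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT);")

    hashes = {"sha512": "placeholder1", "gzip_sha512": "placeholder2"}
    cur.execute("INSERT INTO content (package, filename, size) VALUES (?, ?, ?);",
                ("demo", "./usr/share/doc/demo/changelog.gz", 1234))
    cid = cur.lastrowid
    cur.executemany("INSERT INTO hash (cid, function, hash) VALUES (?, ?, ?);",
                    ((cid, func, hexhash) for func, hexhash in hashes.items()))
    db.commit()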
diff --git a/schema.sql b/schema.sql
--- a/schema.sql
+++ b/schema.sql
@@ -1,10 +1,11 @@
-CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT);
-CREATE TABLE content (package TEXT, filename TEXT, size INTEGER, function TEXT, hash TEXT, FOREIGN KEY (package) REFERENCES package(package));
-CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package), FOREIGN KEY (required) REFERENCES package(package));
-CREATE TABLE source (source TEXT, package TEXT, FOREIGN KEY (package) REFERENCES package(package));
+CREATE TABLE package (package TEXT PRIMARY KEY, version TEXT, architecture TEXT, source TEXT);
+CREATE TABLE content (id INTEGER PRIMARY KEY, package TEXT, filename TEXT, size INTEGER, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
+CREATE TABLE hash (cid INTEGER, function TEXT, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE);
+CREATE TABLE dependency (package TEXT, required TEXT, FOREIGN KEY (package) REFERENCES package(package) ON DELETE CASCADE);
 CREATE INDEX content_package_index ON content (package);
-CREATE INDEX content_hash_index ON content (hash);
+CREATE INDEX hash_cid_index ON hash (cid);
+CREATE INDEX hash_hash_index ON hash (hash);
 
-CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER);
+CREATE TABLE sharing (package1 TEXT, package2 TEXT, func1 TEXT, func2 TEXT, files INTEGER, size INTEGER, FOREIGN KEY (package1) REFERENCES package(package) ON DELETE CASCADE, FOREIGN KEY (package2) REFERENCES package(package) ON DELETE CASCADE);
 CREATE INDEX sharing_insert_index ON sharing (package1, package2, func1, func2);
 CREATE INDEX sharing_package_index ON sharing (package1);
diff --git a/update_sharing.py b/update_sharing.py
index dd12e5b..b45e40b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -40,11 +40,12 @@ def process_pkgdict(cursor, pkgdict):
 def main():
     db = sqlite3.connect("test.sqlite3")
     cur = db.cursor()
+    cur.execute("PRAGMA foreign_keys = ON;")
     cur.execute("DELETE FROM sharing;")
     readcur = db.cursor()
-    readcur.execute("SELECT hash FROM content GROUP BY hash HAVING count(*) > 1;")
+    readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
+        cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
                     (hashvalue,))
         rows = cur.fetchall()
         print("processing hash %s with %d entries" % (hashvalue, len(rows)))
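After the split, every lookup that used to read function and hash straight from content has to join through hash.cid instead, as in the updated update_sharing.py query above. A small sketch of such a lookup; it assumes a test.sqlite3 previously filled by importpkg.py, and the hash value passed in is a placeholder:

    # Sketch only: resolve one hash value back to the packages and filenames
    # carrying it, mirroring the JOIN used by update_sharing.py.
    import sqlite3

    db = sqlite3.connect("test.sqlite3")
    cur = db.cursor()
    cur.execute("SELECT content.package, content.filename, content.size, hash.function "
                "FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
                ("0000placeholder",))
    for package, filename, size, function in cur.fetchall():
        print("%s %s %d %s" % (package, filename, size, function))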
diff --git a/webapp.py b/webapp.py
--- a/webapp.py
+++ b/webapp.py
@@ -252,30 +252,6 @@ class Application(object):
                     (package,))
         return set(row[0] for row in fetchiter(cur))
 
-    def compute_sharedstats(self, package):
-        cur = self.db.cursor()
-        sharedstats = {}
-        for func1, func2 in hash_functions:
-            cur.execute("SELECT a.filename, a.hash, a.size, b.package FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND a.function = ? AND b.function = ? AND (a.filename != b.filename OR b.package != ?);",
-                        (package, func1, func2, package))
-            sharing = dict()
-            for afile, hashval, size, bpkg in fetchiter(cur):
-                hashdict = sharing.setdefault(bpkg, dict())
-                fileset = hashdict.setdefault(hashval, (size, set()))[1]
-                fileset.add(afile)
-            if sharing:
-                sharedstats[function_combination(func1, func2)] = curstats = []
-                mapping = sharing.pop(package, dict())
-                if mapping:
-                    duplicate = sum(len(files) for _, files in mapping.values())
-                    savable = sum(size * (len(files) - 1) for size, files in mapping.values())
-                    curstats.append(dict(package=None, duplicate=duplicate, savable=savable))
-                for pkg, mapping in sharing.items():
-                    duplicate = sum(len(files) for _, files in mapping.values())
-                    savable = sum(size * len(files) for size, files in mapping.values())
-                    curstats.append(dict(package=pkg, duplicate=duplicate, savable=savable))
-        return sharedstats
-
     def cached_sharedstats(self, package):
         cur = self.db.cursor()
         sharedstats = {}
@@ -302,13 +278,13 @@ class Application(object):
 
         if package1 == package2:
             details1 = details2 = self.get_details(package1)
-            cur.execute("SELECT a.filename, a.size, a.function, b.filename, b.size, b.function, a.hash FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND b.package = ? AND a.filename != b.filename ORDER BY a.size DESC, a.filename, b.filename;",
+            cur.execute("SELECT a.filename, a.size, ha.function, b.filename, b.size, hb.function, ha.hash FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE a.package = ? AND b.package = ? AND a.filename != b.filename ORDER BY a.size DESC, a.filename, b.filename;",
                         (package1, package1))
         else:
             details1 = self.get_details(package1)
             details2 = self.get_details(package2)
-            cur.execute("SELECT a.filename, a.size, a.function, b.filename, b.size, b.function, a.hash FROM content AS a JOIN content AS b ON a.hash = b.hash WHERE a.package = ? AND b.package = ? ORDER BY a.size DESC, a.filename, b.filename;",
+            cur.execute("SELECT a.filename, a.size, ha.function, b.filename, b.size, hb.function, ha.hash FROM content AS a JOIN hash AS ha ON a.id = ha.cid JOIN hash AS hb ON ha.hash = hb.hash JOIN content AS b ON b.id = hb.cid WHERE a.package = ? AND b.package = ? ORDER BY a.size DESC, a.filename, b.filename;",
                         (package1, package2))
         shared = generate_shared(fetchiter(cur))
         # The cursor will be in use until the template is fully rendered.
@@ -320,7 +296,7 @@ class Application(object):
 
     def show_hash(self, function, hashvalue):
         cur = self.db.cursor()
-        cur.execute("SELECT package, filename, size, function FROM content WHERE hash = ?;",
+        cur.execute("SELECT content.package, content.filename, content.size, hash.function FROM content JOIN hash ON content.id = hash.cid WHERE hash = ?;",
                     (hashvalue,))
         entries = [dict(package=package, filename=filename, size=size,
                         function=otherfunc)
@@ -333,12 +309,12 @@ class Application(object):
 
     def show_source(self, package):
         cur = self.db.cursor()
-        cur.execute("SELECT package FROM source WHERE source = ?;",
+        cur.execute("SELECT package FROM package WHERE source = ?;",
                     (package,))
         binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
         if not binpkgs:
             raise NotFound
-        cur.execute("SELECT source.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM source JOIN sharing ON source.package = sharing.package1 WHERE source.source = ?;",
+        cur.execute("SELECT package.package, sharing.package2, sharing.func1, sharing.func2, sharing.files, sharing.size FROM package JOIN sharing ON package.package = sharing.package1 WHERE package.source = ?;",
                     (package,))
         for binary, otherbin, func1, func2, files, size in fetchiter(cur):
             entry = dict(package=otherbin,
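webapp.py drops the on-the-fly compute_sharedstats in favour of the precomputed sharing table that update_sharing.py maintains. The body of cached_sharedstats is not part of this diff; the following sketch only illustrates what reading those precomputed rows looks like, using the sharing columns from schema.sql and a made-up package name:

    # Sketch only, not the actual cached_sharedstats implementation.
    import sqlite3

    db = sqlite3.connect("test.sqlite3")
    cur = db.cursor()
    cur.execute("SELECT package2, func1, func2, files, size FROM sharing "
                "WHERE package1 = ? ORDER BY size DESC;",
                ("some-package",))
    for package2, func1, func2, files, size in cur.fetchall():
        print("%s: %d files, %d bytes via %s/%s" %
              (package2, files, size, func1, func2))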