diff options
-rwxr-xr-x | autoimport.py | 61 | ||||
-rw-r--r-- | dedup/debpkg.py | 9 | ||||
-rw-r--r-- | dedup/templates/binary.html | 9 | ||||
-rw-r--r-- | dedup/templates/compare.html | 14 | ||||
-rw-r--r-- | dedup/templates/hash.html | 2 | ||||
-rwxr-xr-x | readyaml.py | 13 | ||||
-rw-r--r-- | schema.sql | 6 | ||||
-rwxr-xr-x | webapp.py | 79 |
8 files changed, 115 insertions, 78 deletions
diff --git a/autoimport.py b/autoimport.py index 9c4c73e..5c2c9be 100755 --- a/autoimport.py +++ b/autoimport.py @@ -26,26 +26,27 @@ def process_http(pkgs, url): pkglist = io.BytesIO(pkglist) pkglist = deb822.Packages.iter_paragraphs(pkglist) for pkg in pkglist: - name = pkg["Package"] - if name in pkgs and \ - version_compare(pkgs[name]["version"], pkg["Version"]) > 0: + key = (pkg["Package"], pkg["Architecture"]) + if key in pkgs and \ + version_compare(pkgs[key]["version"], pkg["Version"]) > 0: continue - pkgs[name] = dict(version=pkg["Version"], - filename="%s/%s" % (url, pkg["Filename"]), - sha256hash=pkg["SHA256"]) + pkgs[key] = dict(version=pkg["Version"], + filename="%s/%s" % (url, pkg["Filename"]), + sha256hash=pkg["SHA256"]) def process_file(pkgs, filename): base = os.path.basename(filename) if not base.endswith(".deb"): raise ValueError("filename does not end in .deb") - parts = base.split("_") + parts = base[:-4].split("_") if len(parts) != 3: raise ValueError("filename not in form name_version_arch.deb") - name, version, _ = parts + name, version, architecture = parts + key = (name, architecture) version = urllib.unquote(version) - if name in pkgs and version_compare(pkgs[name]["version"], version) > 0: + if key in pkgs and version_compare(pkgs[key]["version"], version) > 0: return - pkgs[name] = dict(version=version, filename=filename) + pkgs[key] = dict(version=version, filename=filename) def process_dir(pkgs, d): for entry in os.listdir(d): @@ -54,7 +55,7 @@ def process_dir(pkgs, d): except ValueError: pass -def process_pkg(name, pkgdict, outpath): +def process_pkg(key, pkgdict, outpath): filename = pkgdict["filename"] print("importing %s" % filename) importcmd = ["python", "importpkg.py"] @@ -75,7 +76,7 @@ def process_pkg(name, pkgdict, outpath): with open(outpath, "w") as outp: subprocess.check_call(importcmd, stdin=inp, stdout=outp, close_fds=True) - print("preprocessed %s" % name) + print("preprocessed %s:%s" % key) def main(): parser = optparse.OptionParser() @@ -103,42 +104,44 @@ def main(): process_file(pkgs, d) print("reading database") - cur.execute("SELECT name, version FROM package;") - knownpkgs = dict((row[0], row[1]) for row in cur.fetchall()) + cur.execute("SELECT name, architecture, version FROM package;") + knownpkgs = dict(((row[0], row[1]), row[2]) for row in cur.fetchall()) distpkgs = set(pkgs.keys()) if options.new: - for name in distpkgs: - if name in knownpkgs and version_compare(pkgs[name]["version"], - knownpkgs[name]) <= 0: - del pkgs[name] + for key in distpkgs: + if key in knownpkgs and version_compare(pkgs[key]["version"], + knownpkgs[key]) <= 0: + del pkgs[key] knownpkgs = set(knownpkgs) with e: fs = {} - for name, pkg in pkgs.items(): - outpath = os.path.join(tmpdir, name) - fs[e.submit(process_pkg, name, pkg, outpath)] = name + for key, pkg in pkgs.items(): + outpath = os.path.join(tmpdir, "%s_%s" % key) + fs[e.submit(process_pkg, key, pkg, outpath)] = key for f in concurrent.futures.as_completed(fs.keys()): - name = fs[f] + key = fs[f] if f.exception(): - print("%s failed to import: %r" % (name, f.exception())) + print("%s:%s failed to import: %r" % + (key[0], key[1], f.exception())) continue - inf = os.path.join(tmpdir, name) - print("sqlimporting %s" % name) + inf = os.path.join(tmpdir, "%s_%s" % key) + print("sqlimporting %s:%s" % key) with open(inf) as inp: try: readyaml(db, inp) except Exception as exc: - print("%s failed sql with exception %r" % (name, exc)) + print("%s:%s failed sql with exception %r" % + (key[0], key[1], exc)) else: os.unlink(inf) if options.prune: delpkgs = knownpkgs - distpkgs - print("clearing packages %s" % " ".join(delpkgs)) - cur.executemany("DELETE FROM package WHERE name = ?;", - ((pkg,) for pkg in delpkgs)) + print("clearing packages %s" % " ".join(map("%s:%s".__mod__, delpkgs))) + cur.executemany("DELETE FROM package WHERE name = ? AND architecture = ?;", + delpkgs) # Tables content, dependency and sharing will also be pruned # due to ON DELETE CASCADE clauses. db.commit() diff --git a/dedup/debpkg.py b/dedup/debpkg.py index dbee849..cd8616f 100644 --- a/dedup/debpkg.py +++ b/dedup/debpkg.py @@ -20,8 +20,13 @@ def process_control(control_contents): depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii") for dep in control.relations.get("depends", ()) if len(dep) == 1) - return dict(package=package, source=source, version=version, - architecture=architecture, depends=depends) + ret = dict(package=package, source=source, version=version, + architecture=architecture, depends=depends) + try: + ret["multiarch"] = control["multi-arch"].encode("ascii") + except KeyError: + pass + return ret class MultiHash(object): def __init__(self, *hashes): diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html index 46c4fa6..bf9aad8 100644 --- a/dedup/templates/binary.html +++ b/dedup/templates/binary.html @@ -1,6 +1,7 @@ {% extends "base.html" %} -{% block title %}duplication of {{ package|e }}{% endblock %} -{% block content %}<h1>{{ package|e }}</h1> +{% block title %}duplication of {{ package|e }}:{{ architecture|e }}{% endblock %} +{% block content %}<h1>{{ package|e }}:{{ architecture|e }}</h1> +<p>Package: {{ package|e }}</p> <p>Version: {{ version|e }}</p> <p>Architecture: {{ architecture|e }}</p> <p>Number of files: {{ num_files }}</p> @@ -11,8 +12,8 @@ <table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr> {%- for entry in sharing|sort(attribute="savable", reverse=true) -%} <tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}> - {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %} - <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td> + {%- if entry.package %}<a href="./{{ entry.package|e }}:{{ entry.architecture|e }}"><span class="binary-package">{{ entry.package|e }}:{{ entry.architecture|e}}</span></a>{% else %}self{% endif %} + <a href="../compare/{{ package|e }}:{{ architecture|e }}/{{ entry.package|default(package, true)|e }}:{{ entry.architecture|default(architecture, true)|e }}">compare</a></td> <td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td> <td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr> {%- endfor -%} diff --git a/dedup/templates/compare.html b/dedup/templates/compare.html index f78e80f..7d4564c 100644 --- a/dedup/templates/compare.html +++ b/dedup/templates/compare.html @@ -1,14 +1,12 @@ {% extends "base.html" %} -{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%} +{% block title %}sharing between {{ details1.package|e }}:{{ details1.architecture|e }} and {{ details2.package|e }}:{{ details2.architecture|e }}{% endblock%} {% block content %} -<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> <-> <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1> -<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p> -<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p> -{%- if details1.package != details2.package -%} -<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p> -<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p> +<h1><a href="../../binary/{{ details1.package|e }}:{{ details1.architecture|e }}">{{ details1.package|e }}:{{ details1.architecture|e }}</a> <-> <a href="../../binary/{{ details2.package|e }}:{{ details2.architecture|e }}">{{ details2.package|e }}:{{ details2.architecture|e }}</a></h1> +<p>Version of {{ details1.package|e }}:{{ details1.architecture|e }}: {{ details1.version|e }}</p> +{%- if details1.package != details2.package or details1.architecture != details2.architecture -%} +<p>Version of {{ details2.package|e }}:{{ details2.architecture|e }}: {{ details2.version|e }}</p> {%- endif -%} -<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr> +<table border='1'><tr><th colspan="2">{{ details1.package|e }}:{{ details1.architecture|e }}</th><th colspan="2">{{ details2.package|e }}:{{ details2.architecture|e }}</th></tr> <tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr> {%- for entry in shared -%} <tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}> diff --git a/dedup/templates/hash.html b/dedup/templates/hash.html index 7141f96..f700309 100644 --- a/dedup/templates/hash.html +++ b/dedup/templates/hash.html @@ -4,7 +4,7 @@ <h1>{{ function|e }} {{ hashvalue|e }}</h1> <table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr> {%- for entry in entries -%} - <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td> + <tr><td><a href="../../binary/{{ entry.package|e }}:{{ entry.architecture|e }}"><span class="binary-package">{{ entry.package|e }}:{{ entry.architecture|e }}</span></a></td> <td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td> <td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr> {%- endfor -%} diff --git a/readyaml.py b/readyaml.py index 2ef9a3b..6940f94 100755 --- a/readyaml.py +++ b/readyaml.py @@ -15,8 +15,8 @@ def readyaml(db, stream): gen = yaml.safe_load_all(stream) metadata = next(gen) package = metadata["package"] - cur.execute("SELECT id, version FROM package WHERE name = ?;", - (package,)) + cur.execute("SELECT id, version FROM package WHERE name = ? AND architecture = ?;", + (package, metadata["architecture"])) row = cur.fetchone() if row: pid, version = row @@ -31,12 +31,13 @@ def readyaml(db, stream): if pid is not None: cur.execute("DELETE FROM content WHERE pid = ?;", (pid,)) cur.execute("DELETE FROM dependency WHERE pid = ?;", (pid,)) - cur.execute("UPDATE package SET version = ?, architecture = ?, source = ? WHERE id = ?;", - (metadata["version"], metadata["architecture"], metadata["source"], pid)) + cur.execute("UPDATE package SET version = ?, source = ?, multiarch = ? WHERE id = ?;", + (metadata["version"], metadata["source"], + metadata.get("multiarch"), pid)) else: - cur.execute("INSERT INTO package (name, version, architecture, source) VALUES (?, ?, ?, ?);", + cur.execute("INSERT INTO package (name, version, architecture, source, multiarch) VALUES (?, ?, ?, ?, ?);", (package, metadata["version"], metadata["architecture"], - metadata["source"])) + metadata["source"], metadata.get("multiarch"))) pid = cur.lastrowid cur.executemany("INSERT INTO dependency (pid, required) VALUES (?, ?);", ((pid, dep) for dep in metadata["depends"])) @@ -1,10 +1,12 @@ CREATE TABLE package ( -- binary Debian packages id INTEGER PRIMARY KEY, - name TEXT UNIQUE NOT NULL, + name TEXT NOT NULL, -- binary package name version TEXT, -- Debian version architecture TEXT, - source TEXT); -- name of the source package it was built from + source TEXT, -- name of the source package it was built from + multiarch TEXT, -- value of the Multi-Arch field + UNIQUE (name, architecture)); CREATE TABLE content ( -- a file contained in a binary package id INTEGER PRIMARY KEY, @@ -106,25 +106,28 @@ class Application(object): except HTTPException as e: return e - def get_details(self, package): + def guess_package(self, package): with contextlib.closing(self.db.cursor()) as cur: - cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;", + cur.execute("SELECT architecture, id FROM package WHERE name = ?;", (package,)) + ret = dict(cur.fetchall()) + if not ret: + raise NotFound() + return ret + + def get_details(self, package, architecture): + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT package.id, package.version, count(content.filename), sum(content.size) FROM package JOIN content ON package.id = content.pid WHERE name = ? AND architecture = ? GROUP BY package.id;", + (package, architecture)) row = cur.fetchone() - if not row: - raise NotFound() - pid, version, architecture = row - details = dict(pid=pid, - package=package, - version=version, - architecture=architecture) - cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;", - (pid,)) - num_files, total_size = cur.fetchone() + if not row: + raise NotFound() + pid, version, num_files, total_size = row if total_size is None: total_size = 0 - details.update(dict(num_files=num_files, total_size=total_size)) - return details + return dict(pid=pid, package=package, version=version, + architecture=architecture, num_files=num_files, + total_size=total_size) def get_dependencies(self, pid): with contextlib.closing(self.db.cursor()) as cur: @@ -135,19 +138,27 @@ class Application(object): def cached_sharedstats(self, pid): sharedstats = {} with contextlib.closing(self.db.cursor()) as cur: - cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", + cur.execute("SELECT pid2, package.name, package.architecture, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", (pid,)) - for pid2, package2, func1, func2, files, size in fetchiter(cur): + for pid2, package2, architecture2, func1, func2, files, size in fetchiter(cur): curstats = sharedstats.setdefault( function_combination(func1, func2), list()) if pid2 == pid: package2 = None - curstats.append(dict(package=package2, duplicate=files, - savable=size)) + architecture2 = None + curstats.append(dict(package=package2, + architecture=architecture2, + duplicate=files, savable=size)) return sharedstats def show_package(self, package): - params = self.get_details(package) + if ':' in package: + package, architecture = package.split(':', 1) + else: + architecture = min(self.guess_package(package)) + raise InternalRedirect("/binary/%s:%s" % (package, architecture), + code=302) + params = self.get_details(package, architecture) params["dependencies"] = self.get_dependencies(params["pid"]) params["shared"] = self.cached_sharedstats(params["pid"]) params["urlroot"] = ".." @@ -203,9 +214,24 @@ class Application(object): yield entry def show_detail(self, package1, package2): - details1 = details2 = self.get_details(package1) - if package1 != package2: - details2 = self.get_details(package2) + guessed = False + if ':' in package1: + package1, architecture1 = package1.split(':', 1) + else: + architecture1 = min(self.guess_package(package1)) + guessed = True + if ':' in package2: + package2, architecture2 = package2.split(':', 1) + else: + architecture2 = min(self.guess_package(package2)) + guessed = True + if guessed: + raise InternalRedirect("/compare/%s:%s/%s:%s" % + (package1, architecture1, package2, + architecture2), code=302) + details1 = details2 = self.get_details(package1, architecture1) + if package1 != package2 or architecture1 != architecture2: + details2 = self.get_details(package2, architecture2) shared = self.compute_comparison(details1["pid"], details2["pid"]) params = dict( @@ -217,11 +243,12 @@ class Application(object): def show_hash(self, function, hashvalue): with contextlib.closing(self.db.cursor()) as cur: - cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", + cur.execute("SELECT package.name, package.architecture, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", (function, hashvalue,)) - entries = [dict(package=package, filename=filename, size=size, - function=otherfunc) - for package, filename, size, otherfunc in fetchiter(cur)] + entries = [dict(package=package, architecture=architecture, + filename=filename, size=size, function=otherfunc) + for package, architecture, filename, size, otherfunc + in fetchiter(cur)] if not entries: # Assumption: '~' serves as an infinite character larger than # any other character in the hash column. |