From 7f3d8c5098d23bb43b86e060700170cc41cd22aa Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sat, 7 Sep 2013 21:04:40 +0200 Subject: permit multiple architectures per package While the importer can easily cope with this change, the web presentation still needs fixing. It works somewhat now. --- autoimport.py | 61 ++++++++++++++++---------------- dedup/templates/binary.html | 9 ++--- dedup/templates/compare.html | 14 ++++---- dedup/templates/hash.html | 2 +- readyaml.py | 8 ++--- schema.sql | 2 +- webapp.py | 82 ++++++++++++++++++++++++++++++-------------- 7 files changed, 105 insertions(+), 73 deletions(-) diff --git a/autoimport.py b/autoimport.py index d44c012..c7eb2f5 100755 --- a/autoimport.py +++ b/autoimport.py @@ -26,26 +26,27 @@ def process_http(pkgs, url): pkglist = io.BytesIO(pkglist) pkglist = deb822.Packages.iter_paragraphs(pkglist) for pkg in pkglist: - name = pkg["Package"] - if name in pkgs and \ - version_compare(pkgs[name]["version"], pkg["Version"]) > 0: + key = (pkg["Package"], pkg["Architecture"]) + if key in pkgs and \ + version_compare(pkgs[key]["version"], pkg["Version"]) > 0: continue - pkgs[name] = dict(version=pkg["Version"], - filename="%s/%s" % (url, pkg["Filename"]), - sha256hash=pkg["SHA256"]) + pkgs[key] = dict(version=pkg["Version"], + filename="%s/%s" % (url, pkg["Filename"]), + sha256hash=pkg["SHA256"]) def process_file(pkgs, filename): base = os.path.basename(filename) if not base.endswith(".deb"): raise ValueError("filename does not end in .deb") - parts = base.split("_") + parts = base[:-4].split("_") if len(parts) != 3: raise ValueError("filename not in form name_version_arch.deb") - name, version, _ = parts + name, version, architecture = parts + key = (name, architecture) version = urllib.unquote(version) - if name in pkgs and version_compare(pkgs[name]["version"], version) > 0: + if key in pkgs and version_compare(pkgs[key]["version"], version) > 0: return - pkgs[name] = dict(version=version, filename=filename) + pkgs[key] = dict(version=version, filename=filename) def process_dir(pkgs, d): for entry in os.listdir(d): @@ -54,7 +55,7 @@ def process_dir(pkgs, d): except ValueError: pass -def process_pkg(name, pkgdict, outpath): +def process_pkg(key, pkgdict, outpath): filename = pkgdict["filename"] print("importing %s" % filename) importcmd = ["python", "importpkg.py"] @@ -75,7 +76,7 @@ def process_pkg(name, pkgdict, outpath): with open(outpath, "w") as outp: subprocess.check_call(importcmd, stdin=inp, stdout=outp, close_fds=True) - print("preprocessed %s" % name) + print("preprocessed %s:%s" % key) def main(): parser = optparse.OptionParser() @@ -103,42 +104,44 @@ def main(): process_file(pkgs, d) print("reading database") - cur.execute("SELECT name, version FROM package;") - knownpkgs = dict((row[0], row[1]) for row in cur.fetchall()) + cur.execute("SELECT name, architecture, version FROM package;") + knownpkgs = dict(((row[0], row[1]), row[2]) for row in cur.fetchall()) distpkgs = set(pkgs.keys()) if options.new: - for name in distpkgs: - if name in knownpkgs and version_compare(pkgs[name]["version"], - knownpkgs[name]) <= 0: - del pkgs[name] + for key in distpkgs: + if key in knownpkgs and version_compare(pkgs[key]["version"], + knownpkgs[key]) <= 0: + del pkgs[key] knownpkgs = set(knownpkgs) with e: fs = {} - for name, pkg in pkgs.items(): - outpath = os.path.join(tmpdir, name) - fs[e.submit(process_pkg, name, pkg, outpath)] = name + for key, pkg in pkgs.items(): + outpath = os.path.join(tmpdir, "%s_%s" % key) + fs[e.submit(process_pkg, key, pkg, outpath)] = key for f in concurrent.futures.as_completed(fs.keys()): - name = fs[f] + key = fs[f] if f.exception(): - print("%s failed to import: %r" % (name, f.exception())) + print("%s:%s failed to import: %r" % + (key[0], key[1], f.exception())) continue - inf = os.path.join(tmpdir, name) - print("sqlimporting %s" % name) + inf = os.path.join(tmpdir, "%s_%s" % key) + print("sqlimporting %s:%s" % key) with open(inf) as inp: try: readyaml(db, inp) except Exception as exc: - print("%s failed sql with exception %r" % (name, exc)) + print("%s:%s failed sql with exception %r" % + (key[0], key[1], exc)) else: os.unlink(inf) if options.prune: delpkgs = knownpkgs - distpkgs - print("clearing packages %s" % " ".join(delpkgs)) - cur.executemany("DELETE FROM package WHERE name = ?;", - ((pkg,) for pkg in delpkgs)) + print("clearing packages %s" % " ".join(map("%s:%s".__mod__, delpkgs))) + cur.executemany("DELETE FROM package WHERE name = ? AND architecture = ?;", + delpkgs) # Tables content, dependency and sharing will also be pruned # due to ON DELETE CASCADE clauses. db.commit() diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html index 46c4fa6..bf9aad8 100644 --- a/dedup/templates/binary.html +++ b/dedup/templates/binary.html @@ -1,6 +1,7 @@ {% extends "base.html" %} -{% block title %}duplication of {{ package|e }}{% endblock %} -{% block content %}

{{ package|e }}

+{% block title %}duplication of {{ package|e }}:{{ architecture|e }}{% endblock %} +{% block content %}

{{ package|e }}:{{ architecture|e }}

+

Package: {{ package|e }}

Version: {{ version|e }}

Architecture: {{ architecture|e }}

Number of files: {{ num_files }}

@@ -11,8 +12,8 @@ {%- for entry in sharing|sort(attribute="savable", reverse=true) -%} - {%- if entry.package %}{{ entry.package|e }}{% else %}self{% endif %} - compare + {%- if entry.package %}{{ entry.package|e }}:{{ entry.architecture|e}}{% else %}self{% endif %} + compare {%- endfor -%} diff --git a/dedup/templates/compare.html b/dedup/templates/compare.html index f78e80f..7d4564c 100644 --- a/dedup/templates/compare.html +++ b/dedup/templates/compare.html @@ -1,14 +1,12 @@ {% extends "base.html" %} -{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%} +{% block title %}sharing between {{ details1.package|e }}:{{ details1.architecture|e }} and {{ details2.package|e }}:{{ details2.architecture|e }}{% endblock%} {% block content %} -

{{ details1.package|e }} <-> {{ details2.package|e }}

-

Version of {{ details1.package|e }}: {{ details1.version|e }}

-

Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}

-{%- if details1.package != details2.package -%} -

Version of {{ details2.package|e }}: {{ details2.version|e }}

-

Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}

+

{{ details1.package|e }}:{{ details1.architecture|e }} <-> {{ details2.package|e }}:{{ details2.architecture|e }}

+

Version of {{ details1.package|e }}:{{ details1.architecture|e }}: {{ details1.version|e }}

+{%- if details1.package != details2.package or details1.architecture != details2.architecture -%} +

Version of {{ details2.package|e }}:{{ details2.architecture|e }}: {{ details2.version|e }}

{%- endif -%} -
packagefiles shareddata shared
{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%) {{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)
+
{{ details1.package|e }}{{ details2.package|e }}
{%- for entry in shared -%} 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }} 1 %} rowspan={{ entry.matches|length }}{% endif %}> diff --git a/dedup/templates/hash.html b/dedup/templates/hash.html index 7141f96..f700309 100644 --- a/dedup/templates/hash.html +++ b/dedup/templates/hash.html @@ -4,7 +4,7 @@

{{ function|e }} {{ hashvalue|e }}

{{ details1.package|e }}:{{ details1.architecture|e }}{{ details2.package|e }}:{{ details2.architecture|e }}
sizefilenamehash functionsfilename
{%- for entry in entries -%} - + {%- endfor -%} diff --git a/readyaml.py b/readyaml.py index 2ef9a3b..cd9e5db 100755 --- a/readyaml.py +++ b/readyaml.py @@ -15,8 +15,8 @@ def readyaml(db, stream): gen = yaml.safe_load_all(stream) metadata = next(gen) package = metadata["package"] - cur.execute("SELECT id, version FROM package WHERE name = ?;", - (package,)) + cur.execute("SELECT id, version FROM package WHERE name = ? AND architecture = ?;", + (package, metadata["architecture"])) row = cur.fetchone() if row: pid, version = row @@ -31,8 +31,8 @@ def readyaml(db, stream): if pid is not None: cur.execute("DELETE FROM content WHERE pid = ?;", (pid,)) cur.execute("DELETE FROM dependency WHERE pid = ?;", (pid,)) - cur.execute("UPDATE package SET version = ?, architecture = ?, source = ? WHERE id = ?;", - (metadata["version"], metadata["architecture"], metadata["source"], pid)) + cur.execute("UPDATE package SET version = ?, source = ? WHERE id = ?;", + (metadata["version"], metadata["source"], pid)) else: cur.execute("INSERT INTO package (name, version, architecture, source) VALUES (?, ?, ?, ?);", (package, metadata["version"], metadata["architecture"], diff --git a/schema.sql b/schema.sql index 2ab7ca7..7098a71 100644 --- a/schema.sql +++ b/schema.sql @@ -1,4 +1,4 @@ -CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT); +CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT, version TEXT, architecture TEXT, source TEXT, UNIQUE (name, architecture)); CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER); INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2); diff --git a/webapp.py b/webapp.py index d5f076e..2819a83 100755 --- a/webapp.py +++ b/webapp.py @@ -60,6 +60,11 @@ def html_response(unicode_iterator, max_age=24 * 60 * 60): resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age) return resp +class InternalRedirect(Exception): + def __init__(self, target): + Exception.__init__(self) + self.target = target + class Application(object): def __init__(self, db): self.db = db @@ -83,9 +88,8 @@ class Application(object): elif endpoint == "hash": if args["function"] == "image_sha512": # backwards compatibility - raise RequestRedirect("%s/hash/png_sha512/%s" % - (request.environ["SCRIPT_NAME"], - args["hashvalue"])) + raise InternalRedirect("/hash/png_sha512/%s" % + args["hashvalue"]) return self.show_hash(args["function"], args["hashvalue"]) elif endpoint == "index": if not request.environ["PATH_INFO"]: @@ -94,28 +98,33 @@ class Application(object): elif endpoint == "source": return self.show_source(args["package"]) raise NotFound() + except InternalRedirect as r: + return RequestRedirect(request.environ["SCRIPT_NAME"] + r.target) except HTTPException as e: return e - def get_details(self, package): + def guess_package(self, package): cur = self.db.cursor() - cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;", + cur.execute("SELECT architecture, id FROM package WHERE name = ?;", (package,)) + ret = dict(cur.fetchall()) + if not ret: + raise NotFound() + return ret + + def get_details(self, package, architecture): + cur = self.db.cursor() + cur.execute("SELECT package.id, package.version, package.architecture, count(content.filename), sum(content.size) FROM package JOIN content ON package.id = content.pid WHERE name = ? AND architecture = ? GROUP BY package.id;", + (package, architecture)) row = cur.fetchone() if not row: raise NotFound() - pid, version, architecture = row - details = dict(pid=pid, - package=package, - version=version, - architecture=architecture) - cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;", - (pid,)) - num_files, total_size = cur.fetchone() + pid, version, architecture, num_files, total_size = row if total_size is None: total_size = 0 - details.update(dict(num_files=num_files, total_size=total_size)) - return details + return dict(pid=pid, package=package, version=version, + architecture=architecture, num_files=num_files, + total_size=total_size) def get_dependencies(self, pid): cur = self.db.cursor() @@ -126,18 +135,24 @@ class Application(object): def cached_sharedstats(self, pid): cur = self.db.cursor() sharedstats = {} - cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", + cur.execute("SELECT pid2, package.name, package.architecture, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;", (pid,)) - for pid2, package2, func1, func2, files, size in fetchiter(cur): + for pid2, package2, architecture2, func1, func2, files, size in fetchiter(cur): curstats = sharedstats.setdefault( function_combination(func1, func2), list()) if pid2 == pid: package2 = None - curstats.append(dict(package=package2, duplicate=files, savable=size)) + architecture2 = None + curstats.append(dict(package=package2, architecture=architecture2, duplicate=files, savable=size)) return sharedstats def show_package(self, package): - params = self.get_details(package) + if ':' in package: + package, architecture = package.split(':', 1) + else: + architecture = min(self.guess_package(package)) + raise InternalRedirect("/binary/%s:%s" % (package, architecture)) + params = self.get_details(package, architecture) params["dependencies"] = self.get_dependencies(params["pid"]) params["shared"] = self.cached_sharedstats(params["pid"]) params["urlroot"] = ".." @@ -193,9 +208,24 @@ class Application(object): yield entry def show_detail(self, package1, package2): - details1 = details2 = self.get_details(package1) - if package1 != package2: - details2 = self.get_details(package2) + guessed = False + if ':' in package1: + package1, architecture1 = package1.split(':', 1) + else: + architecture1 = min(self.guess_package(package1)) + guessed = True + if ':' in package2: + package2, architecture2 = package2.split(':', 1) + else: + architecture2 = min(self.guess_package(package2)) + guessed = True + if guessed: + raise InternalRedirect("/compare/%s:%s/%s:%s" % + (package1, architecture1, package2, + architecture2)) + details1 = details2 = self.get_details(package1, architecture1) + if package1 != package2 or architecture1 != architecture2: + details2 = self.get_details(package2, architecture2) shared = self.compute_comparison(details1["pid"], details2["pid"]) params = dict( @@ -207,11 +237,11 @@ class Application(object): def show_hash(self, function, hashvalue): cur = self.db.cursor() - cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", + cur.execute("SELECT package.name, package.architecture, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;", (function, hashvalue,)) - entries = [dict(package=package, filename=filename, size=size, - function=otherfunc) - for package, filename, size, otherfunc in fetchiter(cur)] + entries = [dict(package=package, architecture=architecture, + filename=filename, size=size, function=otherfunc) + for package, architecture, filename, size, otherfunc in fetchiter(cur)] if not entries: raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, -- cgit v1.2.3
packagefilenamesizedifferent function
{{ entry.package|e }}
{{ entry.package|e }}:{{ entry.architecture|e }} {{ entry.filename|e }}{{ entry.size|filesizeformat }} {% if function != entry.function %}{{ entry.function|e }}{% endif %}