summary refs log tree commit diff
diff options
context:
space:
mode:
-rwxr-xr-x autoimport.py 61
-rw-r--r-- dedup/debpkg.py 9
-rw-r--r-- dedup/templates/binary.html 9
-rw-r--r-- dedup/templates/compare.html 14
-rw-r--r-- dedup/templates/hash.html 2
-rwxr-xr-x readyaml.py 13
-rw-r--r-- schema.sql 6
-rwxr-xr-x webapp.py 79
8 files changed, 115 insertions, 78 deletions
diff --git a/autoimport.py b/autoimport.py
index 9c4c73e..5c2c9be 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -26,26 +26,27 @@ def process_http(pkgs, url):
pkglist = io.BytesIO(pkglist)
pkglist = deb822.Packages.iter_paragraphs(pkglist)
for pkg in pkglist:
- name = pkg["Package"]
- if name in pkgs and \
- version_compare(pkgs[name]["version"], pkg["Version"]) > 0:
+ key = (pkg["Package"], pkg["Architecture"])
+ if key in pkgs and \
+ version_compare(pkgs[key]["version"], pkg["Version"]) > 0:
continue
- pkgs[name] = dict(version=pkg["Version"],
- filename="%s/%s" % (url, pkg["Filename"]),
- sha256hash=pkg["SHA256"])
+ pkgs[key] = dict(version=pkg["Version"],
+ filename="%s/%s" % (url, pkg["Filename"]),
+ sha256hash=pkg["SHA256"])
def process_file(pkgs, filename):
base = os.path.basename(filename)
if not base.endswith(".deb"):
raise ValueError("filename does not end in .deb")
- parts = base.split("_")
+ parts = base[:-4].split("_")
if len(parts) != 3:
raise ValueError("filename not in form name_version_arch.deb")
- name, version, _ = parts
+ name, version, architecture = parts
+ key = (name, architecture)
version = urllib.unquote(version)
- if name in pkgs and version_compare(pkgs[name]["version"], version) > 0:
+ if key in pkgs and version_compare(pkgs[key]["version"], version) > 0:
return
- pkgs[name] = dict(version=version, filename=filename)
+ pkgs[key] = dict(version=version, filename=filename)
def process_dir(pkgs, d):
for entry in os.listdir(d):
@@ -54,7 +55,7 @@ def process_dir(pkgs, d):
except ValueError:
pass
-def process_pkg(name, pkgdict, outpath):
+def process_pkg(key, pkgdict, outpath):
filename = pkgdict["filename"]
print("importing %s" % filename)
importcmd = ["python", "importpkg.py"]
@@ -75,7 +76,7 @@ def process_pkg(name, pkgdict, outpath):
with open(outpath, "w") as outp:
subprocess.check_call(importcmd, stdin=inp, stdout=outp,
close_fds=True)
- print("preprocessed %s" % name)
+ print("preprocessed %s:%s" % key)
def main():
parser = optparse.OptionParser()
@@ -103,42 +104,44 @@ def main():
process_file(pkgs, d)
print("reading database")
- cur.execute("SELECT name, version FROM package;")
- knownpkgs = dict((row[0], row[1]) for row in cur.fetchall())
+ cur.execute("SELECT name, architecture, version FROM package;")
+ knownpkgs = dict(((row[0], row[1]), row[2]) for row in cur.fetchall())
distpkgs = set(pkgs.keys())
if options.new:
- for name in distpkgs:
- if name in knownpkgs and version_compare(pkgs[name]["version"],
- knownpkgs[name]) <= 0:
- del pkgs[name]
+ for key in distpkgs:
+ if key in knownpkgs and version_compare(pkgs[key]["version"],
+ knownpkgs[key]) <= 0:
+ del pkgs[key]
knownpkgs = set(knownpkgs)
with e:
fs = {}
- for name, pkg in pkgs.items():
- outpath = os.path.join(tmpdir, name)
- fs[e.submit(process_pkg, name, pkg, outpath)] = name
+ for key, pkg in pkgs.items():
+ outpath = os.path.join(tmpdir, "%s_%s" % key)
+ fs[e.submit(process_pkg, key, pkg, outpath)] = key
for f in concurrent.futures.as_completed(fs.keys()):
- name = fs[f]
+ key = fs[f]
if f.exception():
- print("%s failed to import: %r" % (name, f.exception()))
+ print("%s:%s failed to import: %r" %
+ (key[0], key[1], f.exception()))
continue
- inf = os.path.join(tmpdir, name)
- print("sqlimporting %s" % name)
+ inf = os.path.join(tmpdir, "%s_%s" % key)
+ print("sqlimporting %s:%s" % key)
with open(inf) as inp:
try:
readyaml(db, inp)
except Exception as exc:
- print("%s failed sql with exception %r" % (name, exc))
+ print("%s:%s failed sql with exception %r" %
+ (key[0], key[1], exc))
else:
os.unlink(inf)
if options.prune:
delpkgs = knownpkgs - distpkgs
- print("clearing packages %s" % " ".join(delpkgs))
- cur.executemany("DELETE FROM package WHERE name = ?;",
- ((pkg,) for pkg in delpkgs))
+ print("clearing packages %s" % " ".join(map("%s:%s".__mod__, delpkgs)))
+ cur.executemany("DELETE FROM package WHERE name = ? AND architecture = ?;",
+ delpkgs)
# Tables content, dependency and sharing will also be pruned
# due to ON DELETE CASCADE clauses.
db.commit()
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index dbee849..cd8616f 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -20,8 +20,13 @@ def process_control(control_contents):
depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
for dep in control.relations.get("depends", ())
if len(dep) == 1)
- return dict(package=package, source=source, version=version,
- architecture=architecture, depends=depends)
+ ret = dict(package=package, source=source, version=version,
+ architecture=architecture, depends=depends)
+ try:
+ ret["multiarch"] = control["multi-arch"].encode("ascii")
+ except KeyError:
+ pass
+ return ret
class MultiHash(object):
def __init__(self, *hashes):
diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html
index 46c4fa6..bf9aad8 100644
--- a/dedup/templates/binary.html
+++ b/dedup/templates/binary.html
@@ -1,6 +1,7 @@
{% extends "base.html" %}
-{% block title %}duplication of {{ package|e }}{% endblock %}
-{% block content %}<h1>{{ package|e }}</h1>
+{% block title %}duplication of {{ package|e }}:{{ architecture|e }}{% endblock %}
+{% block content %}<h1>{{ package|e }}:{{ architecture|e }}</h1>
+<p>Package: {{ package|e }}</p>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
@@ -11,8 +12,8 @@
<table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
{%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
<tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
- {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
- <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
+ {%- if entry.package %}<a href="./{{ entry.package|e }}:{{ entry.architecture|e }}"><span class="binary-package">{{ entry.package|e }}:{{ entry.architecture|e}}</span></a>{% else %}self{% endif %}
+ <a href="../compare/{{ package|e }}:{{ architecture|e }}/{{ entry.package|default(package, true)|e }}:{{ entry.architecture|default(architecture, true)|e }}">compare</a></td>
<td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
<td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
{%- endfor -%}
diff --git a/dedup/templates/compare.html b/dedup/templates/compare.html
index f78e80f..7d4564c 100644
--- a/dedup/templates/compare.html
+++ b/dedup/templates/compare.html
@@ -1,14 +1,12 @@
{% extends "base.html" %}
-{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
+{% block title %}sharing between {{ details1.package|e }}:{{ details1.architecture|e }} and {{ details2.package|e }}:{{ details2.architecture|e }}{% endblock%}
{% block content %}
-<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
-<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p>
-<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p>
-{%- if details1.package != details2.package -%}
-<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p>
-<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p>
+<h1><a href="../../binary/{{ details1.package|e }}:{{ details1.architecture|e }}">{{ details1.package|e }}:{{ details1.architecture|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}:{{ details2.architecture|e }}">{{ details2.package|e }}:{{ details2.architecture|e }}</a></h1>
+<p>Version of {{ details1.package|e }}:{{ details1.architecture|e }}: {{ details1.version|e }}</p>
+{%- if details1.package != details2.package or details1.architecture != details2.architecture -%}
+<p>Version of {{ details2.package|e }}:{{ details2.architecture|e }}: {{ details2.version|e }}</p>
{%- endif -%}
-<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
+<table border='1'><tr><th colspan="2">{{ details1.package|e }}:{{ details1.architecture|e }}</th><th colspan="2">{{ details2.package|e }}:{{ details2.architecture|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
<tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
diff --git a/dedup/templates/hash.html b/dedup/templates/hash.html
index 7141f96..f700309 100644
--- a/dedup/templates/hash.html
+++ b/dedup/templates/hash.html
@@ -4,7 +4,7 @@
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
- <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
+ <tr><td><a href="../../binary/{{ entry.package|e }}:{{ entry.architecture|e }}"><span class="binary-package">{{ entry.package|e }}:{{ entry.architecture|e }}</span></a></td>
<td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
<td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
diff --git a/readyaml.py b/readyaml.py
index 2ef9a3b..6940f94 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -15,8 +15,8 @@ def readyaml(db, stream):
gen = yaml.safe_load_all(stream)
metadata = next(gen)
package = metadata["package"]
- cur.execute("SELECT id, version FROM package WHERE name = ?;",
- (package,))
+ cur.execute("SELECT id, version FROM package WHERE name = ? AND architecture = ?;",
+ (package, metadata["architecture"]))
row = cur.fetchone()
if row:
pid, version = row
@@ -31,12 +31,13 @@ def readyaml(db, stream):
if pid is not None:
cur.execute("DELETE FROM content WHERE pid = ?;", (pid,))
cur.execute("DELETE FROM dependency WHERE pid = ?;", (pid,))
- cur.execute("UPDATE package SET version = ?, architecture = ?, source = ? WHERE id = ?;",
- (metadata["version"], metadata["architecture"], metadata["source"], pid))
+ cur.execute("UPDATE package SET version = ?, source = ?, multiarch = ? WHERE id = ?;",
+ (metadata["version"], metadata["source"],
+ metadata.get("multiarch"), pid))
else:
- cur.execute("INSERT INTO package (name, version, architecture, source) VALUES (?, ?, ?, ?);",
+ cur.execute("INSERT INTO package (name, version, architecture, source, multiarch) VALUES (?, ?, ?, ?, ?);",
(package, metadata["version"], metadata["architecture"],
- metadata["source"]))
+ metadata["source"], metadata.get("multiarch")))
pid = cur.lastrowid
cur.executemany("INSERT INTO dependency (pid, required) VALUES (?, ?);",
((pid, dep) for dep in metadata["depends"]))
diff --git a/schema.sql b/schema.sql
index 99ae7e5..3a49daf 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,10 +1,12 @@
CREATE TABLE package ( -- binary Debian packages
id INTEGER PRIMARY KEY,
- name TEXT UNIQUE NOT NULL,
+ name TEXT NOT NULL,
-- binary package name
version TEXT, -- Debian version
architecture TEXT,
- source TEXT); -- name of the source package it was built from
+ source TEXT, -- name of the source package it was built from
+ multiarch TEXT, -- value of the Multi-Arch field
+ UNIQUE (name, architecture));
CREATE TABLE content ( -- a file contained in a binary package
id INTEGER PRIMARY KEY,
diff --git a/webapp.py b/webapp.py
index 9612c38..e173d60 100755
--- a/webapp.py
+++ b/webapp.py
@@ -106,25 +106,28 @@ class Application(object):
except HTTPException as e:
return e
- def get_details(self, package):
+ def guess_package(self, package):
with contextlib.closing(self.db.cursor()) as cur:
- cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
+ cur.execute("SELECT architecture, id FROM package WHERE name = ?;",
(package,))
+ ret = dict(cur.fetchall())
+ if not ret:
+ raise NotFound()
+ return ret
+
+ def get_details(self, package, architecture):
+ with contextlib.closing(self.db.cursor()) as cur:
+ cur.execute("SELECT package.id, package.version, count(content.filename), sum(content.size) FROM package JOIN content ON package.id = content.pid WHERE name = ? AND architecture = ? GROUP BY package.id;",
+ (package, architecture))
row = cur.fetchone()
- if not row:
- raise NotFound()
- pid, version, architecture = row
- details = dict(pid=pid,
- package=package,
- version=version,
- architecture=architecture)
- cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
- (pid,))
- num_files, total_size = cur.fetchone()
+ if not row:
+ raise NotFound()
+ pid, version, num_files, total_size = row
if total_size is None:
total_size = 0
- details.update(dict(num_files=num_files, total_size=total_size))
- return details
+ return dict(pid=pid, package=package, version=version,
+ architecture=architecture, num_files=num_files,
+ total_size=total_size)
def get_dependencies(self, pid):
with contextlib.closing(self.db.cursor()) as cur:
@@ -135,19 +138,27 @@ class Application(object):
def cached_sharedstats(self, pid):
sharedstats = {}
with contextlib.closing(self.db.cursor()) as cur:
- cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
+ cur.execute("SELECT pid2, package.name, package.architecture, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
(pid,))
- for pid2, package2, func1, func2, files, size in fetchiter(cur):
+ for pid2, package2, architecture2, func1, func2, files, size in fetchiter(cur):
curstats = sharedstats.setdefault(
function_combination(func1, func2), list())
if pid2 == pid:
package2 = None
- curstats.append(dict(package=package2, duplicate=files,
- savable=size))
+ architecture2 = None
+ curstats.append(dict(package=package2,
+ architecture=architecture2,
+ duplicate=files, savable=size))
return sharedstats
def show_package(self, package):
- params = self.get_details(package)
+ if ':' in package:
+ package, architecture = package.split(':', 1)
+ else:
+ architecture = min(self.guess_package(package))
+ raise InternalRedirect("/binary/%s:%s" % (package, architecture),
+ code=302)
+ params = self.get_details(package, architecture)
params["dependencies"] = self.get_dependencies(params["pid"])
params["shared"] = self.cached_sharedstats(params["pid"])
params["urlroot"] = ".."
@@ -203,9 +214,24 @@ class Application(object):
yield entry
def show_detail(self, package1, package2):
- details1 = details2 = self.get_details(package1)
- if package1 != package2:
- details2 = self.get_details(package2)
+ guessed = False
+ if ':' in package1:
+ package1, architecture1 = package1.split(':', 1)
+ else:
+ architecture1 = min(self.guess_package(package1))
+ guessed = True
+ if ':' in package2:
+ package2, architecture2 = package2.split(':', 1)
+ else:
+ architecture2 = min(self.guess_package(package2))
+ guessed = True
+ if guessed:
+ raise InternalRedirect("/compare/%s:%s/%s:%s" %
+ (package1, architecture1, package2,
+ architecture2), code=302)
+ details1 = details2 = self.get_details(package1, architecture1)
+ if package1 != package2 or architecture1 != architecture2:
+ details2 = self.get_details(package2, architecture2)
shared = self.compute_comparison(details1["pid"], details2["pid"])
params = dict(
@@ -217,11 +243,12 @@ class Application(object):
def show_hash(self, function, hashvalue):
with contextlib.closing(self.db.cursor()) as cur:
- cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
+ cur.execute("SELECT package.name, package.architecture, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
(function, hashvalue,))
- entries = [dict(package=package, filename=filename, size=size,
- function=otherfunc)
- for package, filename, size, otherfunc in fetchiter(cur)]
+ entries = [dict(package=package, architecture=architecture,
+ filename=filename, size=size, function=otherfunc)
+ for package, architecture, filename, size, otherfunc
+ in fetchiter(cur)]
if not entries:
# Assumption: '~' serves as an infinite character larger than
# any other character in the hash column.