summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rwxr-xr-x autoimport.py 61
-rw-r--r-- dedup/templates/binary.html 9
-rw-r--r-- dedup/templates/compare.html 14
-rw-r--r-- dedup/templates/hash.html 2
-rwxr-xr-x readyaml.py 8
-rw-r--r-- schema.sql 2
-rwxr-xr-x webapp.py 82
7 files changed, 105 insertions, 73 deletions
diff --git a/autoimport.py b/autoimport.py
index d44c012..c7eb2f5 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -26,26 +26,27 @@ def process_http(pkgs, url):
pkglist = io.BytesIO(pkglist)
pkglist = deb822.Packages.iter_paragraphs(pkglist)
for pkg in pkglist:
- name = pkg["Package"]
- if name in pkgs and \
- version_compare(pkgs[name]["version"], pkg["Version"]) > 0:
+ key = (pkg["Package"], pkg["Architecture"])
+ if key in pkgs and \
+ version_compare(pkgs[key]["version"], pkg["Version"]) > 0:
continue
- pkgs[name] = dict(version=pkg["Version"],
- filename="%s/%s" % (url, pkg["Filename"]),
- sha256hash=pkg["SHA256"])
+ pkgs[key] = dict(version=pkg["Version"],
+ filename="%s/%s" % (url, pkg["Filename"]),
+ sha256hash=pkg["SHA256"])
def process_file(pkgs, filename):
base = os.path.basename(filename)
if not base.endswith(".deb"):
raise ValueError("filename does not end in .deb")
- parts = base.split("_")
+ parts = base[:-4].split("_")
if len(parts) != 3:
raise ValueError("filename not in form name_version_arch.deb")
- name, version, _ = parts
+ name, version, architecture = parts
+ key = (name, architecture)
version = urllib.unquote(version)
- if name in pkgs and version_compare(pkgs[name]["version"], version) > 0:
+ if key in pkgs and version_compare(pkgs[key]["version"], version) > 0:
return
- pkgs[name] = dict(version=version, filename=filename)
+ pkgs[key] = dict(version=version, filename=filename)
def process_dir(pkgs, d):
for entry in os.listdir(d):
@@ -54,7 +55,7 @@ def process_dir(pkgs, d):
except ValueError:
pass
-def process_pkg(name, pkgdict, outpath):
+def process_pkg(key, pkgdict, outpath):
filename = pkgdict["filename"]
print("importing %s" % filename)
importcmd = ["python", "importpkg.py"]
@@ -75,7 +76,7 @@ def process_pkg(name, pkgdict, outpath):
with open(outpath, "w") as outp:
subprocess.check_call(importcmd, stdin=inp, stdout=outp,
close_fds=True)
- print("preprocessed %s" % name)
+ print("preprocessed %s:%s" % key)
def main():
parser = optparse.OptionParser()
@@ -103,42 +104,44 @@ def main():
process_file(pkgs, d)
print("reading database")
- cur.execute("SELECT name, version FROM package;")
- knownpkgs = dict((row[0], row[1]) for row in cur.fetchall())
+ cur.execute("SELECT name, architecture, version FROM package;")
+ knownpkgs = dict(((row[0], row[1]), row[2]) for row in cur.fetchall())
distpkgs = set(pkgs.keys())
if options.new:
- for name in distpkgs:
- if name in knownpkgs and version_compare(pkgs[name]["version"],
- knownpkgs[name]) <= 0:
- del pkgs[name]
+ for key in distpkgs:
+ if key in knownpkgs and version_compare(pkgs[key]["version"],
+ knownpkgs[key]) <= 0:
+ del pkgs[key]
knownpkgs = set(knownpkgs)
with e:
fs = {}
- for name, pkg in pkgs.items():
- outpath = os.path.join(tmpdir, name)
- fs[e.submit(process_pkg, name, pkg, outpath)] = name
+ for key, pkg in pkgs.items():
+ outpath = os.path.join(tmpdir, "%s_%s" % key)
+ fs[e.submit(process_pkg, key, pkg, outpath)] = key
for f in concurrent.futures.as_completed(fs.keys()):
- name = fs[f]
+ key = fs[f]
if f.exception():
- print("%s failed to import: %r" % (name, f.exception()))
+ print("%s:%s failed to import: %r" %
+ (key[0], key[1], f.exception()))
continue
- inf = os.path.join(tmpdir, name)
- print("sqlimporting %s" % name)
+ inf = os.path.join(tmpdir, "%s_%s" % key)
+ print("sqlimporting %s:%s" % key)
with open(inf) as inp:
try:
readyaml(db, inp)
except Exception as exc:
- print("%s failed sql with exception %r" % (name, exc))
+ print("%s:%s failed sql with exception %r" %
+ (key[0], key[1], exc))
else:
os.unlink(inf)
if options.prune:
delpkgs = knownpkgs - distpkgs
- print("clearing packages %s" % " ".join(delpkgs))
- cur.executemany("DELETE FROM package WHERE name = ?;",
- ((pkg,) for pkg in delpkgs))
+ print("clearing packages %s" % " ".join(map("%s:%s".__mod__, delpkgs)))
+ cur.executemany("DELETE FROM package WHERE name = ? AND architecture = ?;",
+ delpkgs)
# Tables content, dependency and sharing will also be pruned
# due to ON DELETE CASCADE clauses.
db.commit()
diff --git a/dedup/templates/binary.html b/dedup/templates/binary.html
index 46c4fa6..bf9aad8 100644
--- a/dedup/templates/binary.html
+++ b/dedup/templates/binary.html
@@ -1,6 +1,7 @@
{% extends "base.html" %}
-{% block title %}duplication of {{ package|e }}{% endblock %}
-{% block content %}<h1>{{ package|e }}</h1>
+{% block title %}duplication of {{ package|e }}:{{ architecture|e }}{% endblock %}
+{% block content %}<h1>{{ package|e }}:{{ architecture|e }}</h1>
+<p>Package: {{ package|e }}</p>
<p>Version: {{ version|e }}</p>
<p>Architecture: {{ architecture|e }}</p>
<p>Number of files: {{ num_files }}</p>
@@ -11,8 +12,8 @@
<table border='1'><tr><th>package</th><th>files shared</th><th>data shared</th></tr>
{%- for entry in sharing|sort(attribute="savable", reverse=true) -%}
<tr><td{% if not entry.package or entry.package in dependencies %} class="dependency"{% endif %}>
- {%- if entry.package %}<a href="{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a>{% else %}self{% endif %}
- <a href="../compare/{{ package|e }}/{{ entry.package|default(package, true)|e }}">compare</a></td>
+ {%- if entry.package %}<a href="./{{ entry.package|e }}:{{ entry.architecture|e }}"><span class="binary-package">{{ entry.package|e }}:{{ entry.architecture|e}}</span></a>{% else %}self{% endif %}
+ <a href="../compare/{{ package|e }}:{{ architecture|e }}/{{ entry.package|default(package, true)|e }}:{{ entry.architecture|default(architecture, true)|e }}">compare</a></td>
<td>{{ entry.duplicate }} ({{ (100 * entry.duplicate / num_files)|int }}%)</td>
<td>{{ entry.savable|filesizeformat }} ({{ (100 * entry.savable / total_size)|int }}%)</td></tr>
{%- endfor -%}
diff --git a/dedup/templates/compare.html b/dedup/templates/compare.html
index f78e80f..7d4564c 100644
--- a/dedup/templates/compare.html
+++ b/dedup/templates/compare.html
@@ -1,14 +1,12 @@
{% extends "base.html" %}
-{% block title %}sharing between {{ details1.package|e }} and {{ details2.package|e }}{% endblock%}
+{% block title %}sharing between {{ details1.package|e }}:{{ details1.architecture|e }} and {{ details2.package|e }}:{{ details2.architecture|e }}{% endblock%}
{% block content %}
-<h1><a href="../../binary/{{ details1.package|e }}">{{ details1.package|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}">{{ details2.package|e }}</a></h1>
-<p>Version of {{ details1.package|e }}: {{ details1.version|e }}</p>
-<p>Architecture of {{ details1.package|e }}: {{ details1.architecture|e }}</p>
-{%- if details1.package != details2.package -%}
-<p>Version of {{ details2.package|e }}: {{ details2.version|e }}</p>
-<p>Architecture of {{ details2.package|e }}: {{ details2.architecture|e }}</p>
+<h1><a href="../../binary/{{ details1.package|e }}:{{ details1.architecture|e }}">{{ details1.package|e }}:{{ details1.architecture|e }}</a> &lt;-&gt; <a href="../../binary/{{ details2.package|e }}:{{ details2.architecture|e }}">{{ details2.package|e }}:{{ details2.architecture|e }}</a></h1>
+<p>Version of {{ details1.package|e }}:{{ details1.architecture|e }}: {{ details1.version|e }}</p>
+{%- if details1.package != details2.package or details1.architecture != details2.architecture -%}
+<p>Version of {{ details2.package|e }}:{{ details2.architecture|e }}: {{ details2.version|e }}</p>
{%- endif -%}
-<table border='1'><tr><th colspan="2">{{ details1.package|e }}</th><th colspan="2">{{ details2.package|e }}</th></tr>
+<table border='1'><tr><th colspan="2">{{ details1.package|e }}:{{ details1.architecture|e }}</th><th colspan="2">{{ details2.package|e }}:{{ details2.architecture|e }}</th></tr>
<tr><th>size</th><th>filename</th><th>hash functions</th><th>filename</th></tr>
{%- for entry in shared -%}
<tr><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>{{ entry.size|filesizeformat }}</td><td{% if entry.matches|length > 1 %} rowspan={{ entry.matches|length }}{% endif %}>
diff --git a/dedup/templates/hash.html b/dedup/templates/hash.html
index 7141f96..f700309 100644
--- a/dedup/templates/hash.html
+++ b/dedup/templates/hash.html
@@ -4,7 +4,7 @@
<h1>{{ function|e }} {{ hashvalue|e }}</h1>
<table border='1'><tr><th>package</th><th>filename</th><th>size</th><th>different function</th></tr>
{%- for entry in entries -%}
- <tr><td><a href="../../binary/{{ entry.package|e }}"><span class="binary-package">{{ entry.package|e }}</span></a></td>
+ <tr><td><a href="../../binary/{{ entry.package|e }}:{{ entry.architecture|e }}"><span class="binary-package">{{ entry.package|e }}:{{ entry.architecture|e }}</span></a></td>
<td><span class="filename">{{ entry.filename|e }}</span></td><td>{{ entry.size|filesizeformat }}</td>
<td>{% if function != entry.function %}{{ entry.function|e }}{% endif %}</td></tr>
{%- endfor -%}
diff --git a/readyaml.py b/readyaml.py
index 2ef9a3b..cd9e5db 100755
--- a/readyaml.py
+++ b/readyaml.py
@@ -15,8 +15,8 @@ def readyaml(db, stream):
gen = yaml.safe_load_all(stream)
metadata = next(gen)
package = metadata["package"]
- cur.execute("SELECT id, version FROM package WHERE name = ?;",
- (package,))
+ cur.execute("SELECT id, version FROM package WHERE name = ? AND architecture = ?;",
+ (package, metadata["architecture"]))
row = cur.fetchone()
if row:
pid, version = row
@@ -31,8 +31,8 @@ def readyaml(db, stream):
if pid is not None:
cur.execute("DELETE FROM content WHERE pid = ?;", (pid,))
cur.execute("DELETE FROM dependency WHERE pid = ?;", (pid,))
- cur.execute("UPDATE package SET version = ?, architecture = ?, source = ? WHERE id = ?;",
- (metadata["version"], metadata["architecture"], metadata["source"], pid))
+ cur.execute("UPDATE package SET version = ?, source = ? WHERE id = ?;",
+ (metadata["version"], metadata["source"], pid))
else:
cur.execute("INSERT INTO package (name, version, architecture, source) VALUES (?, ?, ?, ?);",
(package, metadata["version"], metadata["architecture"],
diff --git a/schema.sql b/schema.sql
index 2ab7ca7..7098a71 100644
--- a/schema.sql
+++ b/schema.sql
@@ -1,4 +1,4 @@
-CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT UNIQUE, version TEXT, architecture TEXT, source TEXT);
+CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT, version TEXT, architecture TEXT, source TEXT, UNIQUE (name, architecture));
CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE);
CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER);
INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2);
diff --git a/webapp.py b/webapp.py
index d5f076e..2819a83 100755
--- a/webapp.py
+++ b/webapp.py
@@ -60,6 +60,11 @@ def html_response(unicode_iterator, max_age=24 * 60 * 60):
resp.expires = datetime.datetime.now() + datetime.timedelta(seconds=max_age)
return resp
+class InternalRedirect(Exception):
+ def __init__(self, target):
+ Exception.__init__(self)
+ self.target = target
+
class Application(object):
def __init__(self, db):
self.db = db
@@ -83,9 +88,8 @@ class Application(object):
elif endpoint == "hash":
if args["function"] == "image_sha512":
# backwards compatibility
- raise RequestRedirect("%s/hash/png_sha512/%s" %
- (request.environ["SCRIPT_NAME"],
- args["hashvalue"]))
+ raise InternalRedirect("/hash/png_sha512/%s" %
+ args["hashvalue"])
return self.show_hash(args["function"], args["hashvalue"])
elif endpoint == "index":
if not request.environ["PATH_INFO"]:
@@ -94,28 +98,33 @@ class Application(object):
elif endpoint == "source":
return self.show_source(args["package"])
raise NotFound()
+ except InternalRedirect as r:
+ return RequestRedirect(request.environ["SCRIPT_NAME"] + r.target)
except HTTPException as e:
return e
- def get_details(self, package):
+ def guess_package(self, package):
cur = self.db.cursor()
- cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
+ cur.execute("SELECT architecture, id FROM package WHERE name = ?;",
(package,))
+ ret = dict(cur.fetchall())
+ if not ret:
+ raise NotFound()
+ return ret
+
+ def get_details(self, package, architecture):
+ cur = self.db.cursor()
+ cur.execute("SELECT package.id, package.version, package.architecture, count(content.filename), sum(content.size) FROM package JOIN content ON package.id = content.pid WHERE name = ? AND architecture = ? GROUP BY package.id;",
+ (package, architecture))
row = cur.fetchone()
if not row:
raise NotFound()
- pid, version, architecture = row
- details = dict(pid=pid,
- package=package,
- version=version,
- architecture=architecture)
- cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
- (pid,))
- num_files, total_size = cur.fetchone()
+ pid, version, architecture, num_files, total_size = row
if total_size is None:
total_size = 0
- details.update(dict(num_files=num_files, total_size=total_size))
- return details
+ return dict(pid=pid, package=package, version=version,
+ architecture=architecture, num_files=num_files,
+ total_size=total_size)
def get_dependencies(self, pid):
cur = self.db.cursor()
@@ -126,18 +135,24 @@ class Application(object):
def cached_sharedstats(self, pid):
cur = self.db.cursor()
sharedstats = {}
- cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
+ cur.execute("SELECT pid2, package.name, package.architecture, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
(pid,))
- for pid2, package2, func1, func2, files, size in fetchiter(cur):
+ for pid2, package2, architecture2, func1, func2, files, size in fetchiter(cur):
curstats = sharedstats.setdefault(
function_combination(func1, func2), list())
if pid2 == pid:
package2 = None
- curstats.append(dict(package=package2, duplicate=files, savable=size))
+ architecture2 = None
+ curstats.append(dict(package=package2, architecture=architecture2, duplicate=files, savable=size))
return sharedstats
def show_package(self, package):
- params = self.get_details(package)
+ if ':' in package:
+ package, architecture = package.split(':', 1)
+ else:
+ architecture = min(self.guess_package(package))
+ raise InternalRedirect("/binary/%s:%s" % (package, architecture))
+ params = self.get_details(package, architecture)
params["dependencies"] = self.get_dependencies(params["pid"])
params["shared"] = self.cached_sharedstats(params["pid"])
params["urlroot"] = ".."
@@ -193,9 +208,24 @@ class Application(object):
yield entry
def show_detail(self, package1, package2):
- details1 = details2 = self.get_details(package1)
- if package1 != package2:
- details2 = self.get_details(package2)
+ guessed = False
+ if ':' in package1:
+ package1, architecture1 = package1.split(':', 1)
+ else:
+ architecture1 = min(self.guess_package(package1))
+ guessed = True
+ if ':' in package2:
+ package2, architecture2 = package2.split(':', 1)
+ else:
+ architecture2 = min(self.guess_package(package2))
+ guessed = True
+ if guessed:
+ raise InternalRedirect("/compare/%s:%s/%s:%s" %
+ (package1, architecture1, package2,
+ architecture2))
+ details1 = details2 = self.get_details(package1, architecture1)
+ if package1 != package2 or architecture1 != architecture2:
+ details2 = self.get_details(package2, architecture2)
shared = self.compute_comparison(details1["pid"], details2["pid"])
params = dict(
@@ -207,11 +237,11 @@ class Application(object):
def show_hash(self, function, hashvalue):
cur = self.db.cursor()
- cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
+ cur.execute("SELECT package.name, package.architecture, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
(function, hashvalue,))
- entries = [dict(package=package, filename=filename, size=size,
- function=otherfunc)
- for package, filename, size, otherfunc in fetchiter(cur)]
+ entries = [dict(package=package, architecture=architecture,
+ filename=filename, size=size, function=otherfunc)
+ for package, architecture, filename, size, otherfunc in fetchiter(cur)]
if not entries:
raise NotFound()
params = dict(function=function, hashvalue=hashvalue, entries=entries,