-rw-r--r--  COPYING                    |   5
-rwxr-xr-x  autoimport.py              |   4
-rw-r--r--  dedup/arreader.py          |   2
-rw-r--r--  dedup/compression.py       |  15
-rw-r--r--  dedup/debpkg.py            |   6
-rw-r--r--  dedup/hashing.py           |  56
-rw-r--r--  dedup/image.py             |   2
-rw-r--r--  dedup/templates/index.html |   2
-rwxr-xr-x  importpkg.py               |  70
-rw-r--r--  schema.sql                 |  81
-rwxr-xr-x  update_sharing.py          |  20
-rwxr-xr-x  webapp.py                  | 125
12 files changed, 268 insertions(+), 120 deletions(-)
diff --git a/COPYING b/COPYING
--- a/COPYING
+++ b/COPYING
@@ -1,10 +1,11 @@
 Main author:
 
-Copyright (C) 2013 Helmut Grohne <helmut@subdivi.de>
+Copyright (C) 2013-2014 Helmut Grohne <helmut@subdivi.de>
 
-Contributor:
+Contributors:
 
 Copyright (C) 2013 Jan Lühr (javascript for webapp.py)
+Copyright (C) 2014 Guillem Jover <guillem@debian.org>
 
 All rights reserved.
diff --git a/autoimport.py b/autoimport.py
index c7eb2f5..5c2c9be 100755
--- a/autoimport.py
+++ b/autoimport.py
@@ -61,7 +61,7 @@ def process_pkg(key, pkgdict, outpath):
     importcmd = ["python", "importpkg.py"]
     if "sha256hash" in pkgdict:
         importcmd.extend(["-H", pkgdict["sha256hash"]])
-    if filename.startswith("http://"):
+    if filename.startswith(("http://", "https://", "ftp://", "file://")):
         with open(outpath, "w") as outp:
             dl = subprocess.Popen(["curl", "-s", filename],
                                   stdout=subprocess.PIPE, close_fds=True)
@@ -96,7 +96,7 @@ def main():
     pkgs = {}
     for d in args:
         print("processing %s" % d)
-        if d.startswith("http://"):
+        if d.startswith(("http://", "https://", "ftp://", "file://")):
            process_http(pkgs, d)
         elif os.path.isdir(d):
             process_dir(pkgs, d)
diff --git a/dedup/arreader.py b/dedup/arreader.py
index d74ae37..e53efd9 100644
--- a/dedup/arreader.py
+++ b/dedup/arreader.py
@@ -45,7 +45,7 @@ class ArReader(object):
         if not file_header:
             raise EOFError("end of archive found")
         parts = struct.unpack("16s 12s 6s 6s 8s 10s 2s", file_header)
-        parts = [p.rstrip(b" ") for p in parts]
+        parts = [p.rstrip(b"/ ") for p in parts]
         if parts.pop() != self.file_magic:
             raise ValueError("ar file header not found")
         self.remaining = int(parts[5])
diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
     """An interface to gzip which is similar to bz2.BZ2Decompressor and
     lzma.LZMADecompressor."""
     def __init__(self):
+        self.sawheader = False
         self.inbuffer = b""
         self.decompressor = None
+        self.crc = 0
+        self.size = 0
 
     def decompress(self, data):
         """
@@ -16,6 +19,8 @@ class GzipDecompressor(object):
         while True:
             if self.decompressor:
                 data = self.decompressor.decompress(data)
+                self.crc = zlib.crc32(data, self.crc)
+                self.size += len(data)
                 unused_data = self.decompressor.unused_data
                 if not unused_data:
                     return data
@@ -45,13 +50,20 @@ class GzipDecompressor(object):
                 return b""
             data = self.inbuffer[skip:]
             self.inbuffer = b""
+            self.sawheader = True
             self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
 
     @property
     def unused_data(self):
         if self.decompressor:
             return self.decompressor.unused_data
+        elif not self.sawheader:
+            return self.inbuffer
         else:
+            expect = struct.pack("<ll", self.crc, self.size)
+            if self.inbuffer.startswith(expect) and \
+                    self.inbuffer[len(expect):].replace("\0", "") == "":
+                return b""
             return self.inbuffer
 
     def flush(self):
@@ -67,6 +79,9 @@ class GzipDecompressor(object):
         new.inbuffer = self.inbuffer
         if self.decompressor:
             new.decompressor = self.decompressor.copy()
+        new.sawheader = self.sawheader
+        new.crc = self.crc
+        new.size = self.size
         return new
 
 class DecompressedStream(object):
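Note on the new unused_data logic above: a gzip member ends with a little-endian CRC32 and ISIZE (uncompressed length mod 2**32), and anything beyond that should be nothing but NUL padding, such as the padding tar adds to fill a 512-byte block. A minimal standalone sketch of the same trailer check (the function name and the unsigned "<LL" packing are illustrative, not from the patch):

    import struct
    import zlib

    def benign_gzip_tail(tail, crc, size):
        # A gzip member ends with CRC32 and ISIZE, both little-endian 32-bit.
        # Mask to 32 bits since zlib.crc32 can return negative values on Python 2.
        expect = struct.pack("<LL", crc & 0xffffffff, size & 0xffffffff)
        # Tolerate only trailing NUL padding (e.g. tar block alignment).
        return tail.startswith(expect) and not tail[len(expect):].strip(b"\0")

With this, a buffer holding exactly the trailer plus padding makes unused_data report b"", so the stricter DecompressedHash below no longer rejects valid gzip members embedded in tar files.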
diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index 6d857ba..cd8616f 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -3,7 +3,7 @@ from debian import deb822
 from dedup.hashing import hash_file
 
 def process_control(control_contents):
-    """Parses the contents of a control file from a control.tar.gz of a Debian
+    """Parses the contents of a control file from a control.tar of a Debian
     package and returns a dictionary containing the fields relevant to dedup.
     @type control_contents: bytes
     @rtype: {str: object}
@@ -16,8 +16,8 @@ def process_control(control_contents):
         source = package
     version = control["version"].encode("ascii")
     architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
+    # deb822 currently returns :any dependencies raw. see #670679
+    depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
                   for dep in control.relations.get("depends", ())
                   if len(dep) == 1)
     ret = dict(package=package, source=source, version=version,
diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..a8a46c7 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -1,3 +1,5 @@
+import itertools
+
 class HashBlacklist(object):
     """Turn a hashlib-like object into a hash that returns None for some
     blacklisted hashes instead of the real hash value.
@@ -29,6 +31,54 @@ class HashBlacklist(object):
     def copy(self):
         return HashBlacklist(self.hashobj.copy(), self.blacklist)
 
+class HashBlacklistContent(object):
+    """Turn a hashlib-like object into a hash that returns None for some
+    blacklisted content instead of the real hash value. Unlike HashBlacklist,
+    not the output of the hash is considered, but its input."""
+
+    def __init__(self, hashobj, blacklist=(), maxlen=None):
+        """
+        @param hashobj: a hashlib-like object
+        @param blacklist: an object providing __contains__.
+            hash inputs which are contained in the blacklist
+            are turned into None values
+        @param maxlen: the maximum length of a blacklisted input.
+            Defaults to max(map(len, blacklist)), so if it is absent,
+            the blacklist must support iteration.
+        """
+        self.hashobj = hashobj
+        self.blacklist = blacklist
+        if maxlen is None:
+            # the chain avoids passing the empty sequence to max
+            maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist)))
+        self.maxlen = maxlen
+        self.stored = ""
+
+    @property
+    def name(self):
+        return self.hashobj.name
+
+    def update(self, data):
+        if self.stored is not None:
+            self.stored += data
+            if len(self.stored) > self.maxlen:
+                self.stored = None
+        self.hashobj.update(data)
+
+    def digest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.digest()
+
+    def hexdigest(self):
+        if self.stored is not None and self.stored in self.blacklist:
+            return None
+        return self.hashobj.hexdigest()
+
+    def copy(self):
+        return HashBlacklistContent(self.hashobj.copy(), self.blacklist,
+                                    self.maxlen)
+
 class DecompressedHash(object):
     """Apply a decompression function before the hash. This class provides
     the hashlib interface (update, hexdigest, copy) excluding digest and name."""
@@ -49,9 +99,13 @@ class DecompressedHash(object):
 
     def hexdigest(self):
         if not hasattr(self.decompressor, "flush"):
+            if self.decompressor.unused_data:
+                raise ValueError("decompressor did not consume all data")
             return self.hashobj.hexdigest()
         tmpdecomp = self.decompressor.copy()
         data = tmpdecomp.flush()
+        if tmpdecomp.unused_data:
+            raise ValueError("decompressor did not consume all data")
         tmphash = self.hashobj.copy()
         tmphash.update(data)
         return tmphash.hexdigest()
@@ -61,7 +115,7 @@ class DecompressedHash(object):
 
 class SuppressingHash(object):
     """A hash that silences exceptions from the update and hexdigest methods of
-    a hashlib-like object. If an exception has occured, hexdigest always
+    a hashlib-like object. If an exception has occurred, hexdigest always
     returns None."""
     def __init__(self, hashobj, exceptions=()):
         """
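A usage sketch for the new HashBlacklistContent (values are hypothetical; this mirrors how importpkg.py uses it further down). Content is only buffered up to the length of the longest blacklisted input, so hashing large files pays a bounded memory cost:

    import hashlib

    from dedup.hashing import HashBlacklistContent

    boring = HashBlacklistContent(hashlib.sha512(), set(("", "\n")))
    boring.update("\n")
    print(boring.hexdigest())   # None: the input itself is blacklisted

    real = HashBlacklistContent(hashlib.sha512(), set(("", "\n")))
    real.update("some actual content")
    print(real.hexdigest())     # the ordinary sha512 hex digest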
diff --git a/dedup/image.py b/dedup/image.py
index c1f2de0..ef17989 100644
--- a/dedup/image.py
+++ b/dedup/image.py
@@ -4,7 +4,7 @@ import struct
 import PIL.Image
 
 class ImageHash(object):
-    """A hash on the contents of an image datat type supported by PIL. This
+    """A hash on the contents of an image data type supported by PIL. This
     disregards mode, depth and meta information. Note that due to limitations
     in PIL and the image format (interlacing) the full contents are stored and
     decoded in hexdigest."""
diff --git a/dedup/templates/index.html b/dedup/templates/index.html
index 7c9000f..169027e 100644
--- a/dedup/templates/index.html
+++ b/dedup/templates/index.html
@@ -28,7 +28,7 @@
 {% block content %}
 <h1>Debian duplication detector</h1>
 <ul>
-<li>To inspect a particlar binary package, go to <pre>binary/<packagename></pre> Example: <a href="binary/git">binary/git</a>
+<li>To inspect a particular binary package, go to <pre>binary/<packagename></pre> Example: <a href="binary/git">binary/git</a>
 <div style="display:none" id="form_div"><fieldset>
 <legend>Inspect package</legend>
 <noscript><b>This form is dysfunctional when javascript is not enabled</b></noscript>
diff --git a/importpkg.py b/importpkg.py
index 54f6181..06d9da4 100755
--- a/importpkg.py
+++ b/importpkg.py
@@ -1,7 +1,7 @@
 #!/usr/bin/python
-"""This tool reads a debian package from stdin and emits a yaml stream on
+"""This tool reads a Debian package from stdin and emits a yaml stream on
 stdout. It does not access a database. Therefore it can be run in parallel and
-on multiple machines. The generated yaml conatins multiple documents. The first
+on multiple machines. The generated yaml contains multiple documents. The first
 document contains package metadata. Then a document is emitted for each file.
 And finally a document consisting of the string "commit" is emitted."""
 
@@ -16,26 +16,21 @@ import yaml
 
 from dedup.arreader import ArReader
 from dedup.debpkg import process_control, get_tar_hashes
-from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \
-    HashedStream
+from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \
+    HashBlacklistContent
 from dedup.compression import GzipDecompressor, DecompressedStream
 from dedup.image import GIFHash, PNGHash
 
-boring_sha512_hashes = set((
-    # ""
-    "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e",
-    # "\n"
-    "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09"))
+boring_content = set(("", "\n"))
 
 def sha512_nontrivial():
-    return HashBlacklist(hashlib.sha512(), boring_sha512_hashes)
+    return HashBlacklistContent(hashlib.sha512(), boring_content)
 
 def gziphash():
     hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512())
     hashobj = SuppressingHash(hashobj, (ValueError, zlib.error))
     hashobj.name = "gzip_sha512"
-    # don't blacklist boring hashes for gzip to get gzip issues right
-    return hashobj
+    return HashBlacklistContent(hashobj, boring_content)
 
 def pnghash():
     hashobj = PNGHash(hashlib.sha512())
@@ -49,6 +44,15 @@ def gifhash():
     hashobj.name = "gif_sha512"
     return hashobj
 
+def decompress_tar(filelike, extension):
+    if extension in (".lzma", ".xz"):
+        filelike = DecompressedStream(filelike, lzma.LZMADecompressor())
+        extension = ""
+    if extension not in ("", ".gz", ".bz2"):
+        raise ValueError("unknown compression format with extension %r" %
+                         extension)
+    return tarfile.open(fileobj=filelike, mode="r|" + extension[1:])
+
 def process_package(filelike, hash_functions):
     af = ArReader(filelike)
     af.read_magic()
@@ -58,11 +62,11 @@ def process_package(filelike, hash_functions):
             name = af.read_entry()
         except EOFError:
             raise ValueError("data.tar not found")
-        if name == "control.tar.gz":
+        if name.startswith("control.tar"):
             if state != "start":
-                raise ValueError("unexpected control.tar.gz")
+                raise ValueError("unexpected control.tar")
             state = "control"
-            tf = tarfile.open(fileobj=af, mode="r|gz")
+            tf = decompress_tar(af, name[11:])
             for elem in tf:
                 if elem.name != "./control":
                     continue
@@ -72,28 +76,20 @@ def process_package(filelike, hash_functions):
                 yield process_control(tf.extractfile(elem).read())
                 break
             continue
-        elif name == "data.tar.gz":
-            tf = tarfile.open(fileobj=af, mode="r|gz")
-        elif name == "data.tar.bz2":
-            tf = tarfile.open(fileobj=af, mode="r|bz2")
-        elif name == "data.tar.xz":
-            zf = DecompressedStream(af, lzma.LZMADecompressor())
-            tf = tarfile.open(fileobj=zf, mode="r|")
-        elif name == "data.tar":
-            tf = tarfile.open(fileobj=af, mode="r|")
-        else:
-            continue
-        if state != "control_file":
-            raise ValueError("missing control file")
-        for name, size, hashes in get_tar_hashes(tf, hash_functions):
-            try:
-                name = name.decode("utf8")
-            except UnicodeDecodeError:
-                print("warning: skipping filename with encoding error")
-                continue # skip files with non-utf8 encoding for now
-            yield dict(name=name, size=size, hashes=hashes)
-        yield "commit"
-        break
+        elif name.startswith("data.tar"):
+            if state != "control_file":
+                raise ValueError("missing control file")
+            state = "data"
+            tf = decompress_tar(af, name[8:])
+            for name, size, hashes in get_tar_hashes(tf, hash_functions):
+                try:
+                    name = name.decode("utf8")
+                except UnicodeDecodeError:
+                    print("warning: skipping filename with encoding error")
+                    continue # skip files with non-utf8 encoding for now
+                yield dict(name=name, size=size, hashes=hashes)
+            yield "commit"
+            break
 
 def process_package_with_hash(filelike, hash_functions, sha256hash):
     hstream = HashedStream(filelike, hashlib.sha256())
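The slicing offsets in the rewritten process_package are easy to misread: len("control.tar") == 11 and len("data.tar") == 8, so name[11:] and name[8:] yield just the compression extension, which decompress_tar then maps onto a tarfile stream mode. A quick illustration:

    >>> "control.tar.gz"[11:]
    '.gz'
    >>> "data.tar.xz"[8:]
    '.xz'
    >>> "data.tar"[8:]
    ''

For ".xz" and ".lzma" the member is first wrapped in a DecompressedStream; the remaining cases reduce to the tarfile modes "r|", "r|gz" and "r|bz2".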
print("warning: skipping filename with encoding error") + continue # skip files with non-utf8 encoding for now + yield dict(name=name, size=size, hashes=hashes) + yield "commit" + break def process_package_with_hash(filelike, hash_functions, sha256hash): hstream = HashedStream(filelike, hashlib.sha256()) @@ -1,20 +1,81 @@ -CREATE TABLE package (id INTEGER PRIMARY KEY, name TEXT, version TEXT, architecture TEXT, source TEXT, multiarch TEXT, UNIQUE (name, architecture)); -CREATE TABLE content (id INTEGER PRIMARY KEY, pid INTEGER, filename TEXT, size INTEGER, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); -CREATE TABLE function (id INTEGER PRIMARY KEY, name TEXT UNIQUE NOT NULL, eqclass INTEGER); -INSERT INTO function (name, eqclass) VALUES ("sha512", 1), ("gzip_sha512", 1), ("png_sha512", 2), ("gif_sha512", 2); -CREATE TABLE hash (cid INTEGER, fid INTEGER NOT NULL, hash TEXT, FOREIGN KEY (cid) REFERENCES content(id) ON DELETE CASCADE, FOREIGN KEY (fid) REFERENCES function(id)); -CREATE TABLE dependency (pid INTEGER, required TEXT, FOREIGN KEY (pid) REFERENCES package(id) ON DELETE CASCADE); +CREATE TABLE package ( -- binary Debian packages + id INTEGER PRIMARY KEY, + name TEXT NOT NULL, + -- binary package name + version TEXT, -- Debian version + architecture TEXT, + source TEXT, -- name of the source package it was built from + multiarch TEXT, -- value of the Multi-Arch field + UNIQUE (name, architecture)); + +CREATE TABLE content ( -- a file contained in a binary package + id INTEGER PRIMARY KEY, + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- which package the file is contained in + filename TEXT NOT NULL, + size INTEGER NOT NULL); + +CREATE TABLE function ( -- hash functions + id INTEGER PRIMARY KEY, + name TEXT UNIQUE NOT NULL, + eqclass INTEGER); + -- hash values of different hash functions are comparable if they share + -- an eqclass + +INSERT INTO function (id, name, eqclass) VALUES + (1, 'sha512', 1), + (2, 'gzip_sha512', 1), + -- decompress a gzip file, then hash + (3, 'png_sha512', 2), + -- decompress a PNG file, hash RGBA image contents + dimension + (4, 'gif_sha512', 2); + -- decompress a GIF file, hash RGBA image contents + dimension + +CREATE TABLE hash ( -- hash values of files in binary packages + cid INTEGER NOT NULL REFERENCES content(id) ON DELETE CASCADE, + -- which file has been hashed + fid INTEGER NOT NULL REFERENCES function(id), + -- using which function + hash TEXT NOT NULL); -- textual hash value + +CREATE TABLE dependency ( -- binary package dependencies + pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE, + -- the package that carries a Depends: header + required TEXT NOT NULL); + -- the name of a package that is depended upon without alternative + CREATE INDEX content_package_size_index ON content (pid, size); CREATE INDEX hash_cid_index ON hash (cid); CREATE INDEX hash_hash_index ON hash (hash); +-- All tables below this line can be recomputed from the tables above. +-- Recomputation is done using the update_sharing.py script. + +-- The sharing table caches two values for each pair of packages pid1, pid2 and +-- pair of hash functions fid1, fid2: +-- * files is the number of files in pid1 that could be eliminated by reusing +-- files from pid2. Since the functions may be different, this may mean +-- replacing a compressed file with an uncompressed one. +-- * size is the number of bytes that would be freed by doing the above. +-- Note: If pid1=pid2, one copy of each file must be preserved. 
diff --git a/update_sharing.py b/update_sharing.py
index 1ff0fd8..ca6890b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -47,14 +47,20 @@ def main(db):
     readcur = db.cursor()
     readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+        cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;",
                     (hashvalue,))
-        rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-        pkgdict = compute_pkgdict(rows)
-        cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
-                        [(row[1],) for row in rows])
-        process_pkgdict(cur, pkgdict)
+        rowdict = dict()
+        for row in cur.fetchall():
+            rowdict.setdefault(row[0], []).append(row[1:])
+        for eqclass, rows in rowdict.items():
+            if len(rows) < 2:
+                print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+                continue
+            print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+            pkgdict = compute_pkgdict(rows)
+            cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+                            [(row[1],) for row in rows])
+            process_pkgdict(cur, pkgdict)
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
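update_sharing.py now buckets the candidate rows for one duplicated hash value by eqclass before computing sharing, since a value matched under class-1 functions must not be merged with class-2 rows. The grouping idiom in isolation, with made-up row tuples:

    rows = [(1, 10, 100, "a", 42, 1),   # (eqclass, pid, cid, filename, size, fid)
            (1, 11, 101, "b", 42, 2),
            (2, 12, 102, "c", 42, 3)]
    rowdict = dict()
    for row in rows:
        rowdict.setdefault(row[0], []).append(row[1:])
    # rowdict == {1: [two candidate files], 2: [one file]}
    # class 2 has fewer than two entries and is skipped: nothing to share there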
diff --git a/webapp.py b/webapp.py
--- a/webapp.py
+++ b/webapp.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import contextlib
 import datetime
 import optparse
 import sqlite3
@@ -7,7 +8,8 @@ from wsgiref.simple_server import make_server
 
 import jinja2
 from werkzeug.exceptions import HTTPException, NotFound
-from werkzeug.routing import Map, Rule, RequestRedirect
+from werkzeug.routing import Map, Rule
+from werkzeug.utils import redirect
 from werkzeug.wrappers import Request, Response
 from werkzeug.wsgi import SharedDataMiddleware
@@ -61,9 +63,10 @@ def html_response(unicode_iterator, max_age=24 * 60 * 60):
     return resp
 
 class InternalRedirect(Exception):
-    def __init__(self, target):
+    def __init__(self, target, code=301):
         Exception.__init__(self)
         self.target = target
+        self.code = code
 
 class Application(object):
     def __init__(self, db):
@@ -93,33 +96,33 @@ class Application(object):
                 return self.show_hash(args["function"], args["hashvalue"])
             elif endpoint == "index":
                 if not request.environ["PATH_INFO"]:
-                    raise RequestRedirect(request.environ["SCRIPT_NAME"] + "/")
+                    raise InternalRedirect("/")
                 return html_response(index_template.render(dict(urlroot="")))
             elif endpoint == "source":
                 return self.show_source(args["package"])
             raise NotFound()
         except InternalRedirect as r:
-            return RequestRedirect(request.environ["SCRIPT_NAME"] + r.target)
+            return redirect(request.environ["SCRIPT_NAME"] + r.target, r.code)
         except HTTPException as e:
             return e
 
     def guess_package(self, package):
-        cur = self.db.cursor()
-        cur.execute("SELECT architecture, id FROM package WHERE name = ?;",
-                    (package,))
-        ret = dict(cur.fetchall())
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT architecture, id FROM package WHERE name = ?;",
+                        (package,))
+            ret = dict(cur.fetchall())
         if not ret:
             raise NotFound()
         return ret
 
     def get_details(self, package, architecture):
-        cur = self.db.cursor()
-        cur.execute("SELECT package.id, package.version, package.architecture, count(content.filename), sum(content.size) FROM package JOIN content ON package.id = content.pid WHERE name = ? AND architecture = ? GROUP BY package.id;",
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT package.id, package.version, count(content.filename), sum(content.size) FROM package JOIN content ON package.id = content.pid WHERE name = ? AND architecture = ? GROUP BY package.id;",
                         (package, architecture))
-        row = cur.fetchone()
+            row = cur.fetchone()
         if not row:
             raise NotFound()
-        pid, version, architecture, num_files, total_size = row
+        pid, version, num_files, total_size = row
         if total_size is None:
             total_size = 0
         return dict(pid=pid, package=package, version=version,
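contextlib.closing is the pattern used throughout these cursor rewrites: sqlite3 cursors are not context managers on Python 2, so closing() supplies the missing __enter__/__exit__ and guarantees cur.close() even when a NotFound escapes mid-query. In isolation:

    import contextlib
    import sqlite3

    db = sqlite3.connect(":memory:")
    with contextlib.closing(db.cursor()) as cur:
        cur.execute("SELECT 1;")
        print(cur.fetchone())
    # cur.close() has run here, whether or not the body raised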
@@ -127,23 +130,25 @@ class Application(object):
                     total_size=total_size)
 
     def get_dependencies(self, pid):
-        cur = self.db.cursor()
-        cur.execute("SELECT required FROM dependency WHERE pid = ?;",
-                    (pid,))
-        return set(row[0] for row in fetchiter(cur))
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT required FROM dependency WHERE pid = ?;",
+                        (pid,))
+            return set(row[0] for row in fetchiter(cur))
 
     def cached_sharedstats(self, pid):
-        cur = self.db.cursor()
         sharedstats = {}
-        cur.execute("SELECT pid2, package.name, package.architecture, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
-                    (pid,))
-        for pid2, package2, architecture2, func1, func2, files, size in fetchiter(cur):
-            curstats = sharedstats.setdefault(
-                    function_combination(func1, func2), list())
-            if pid2 == pid:
-                package2 = None
-                architecture2 = None
-            curstats.append(dict(package=package2, architecture=architecture2, duplicate=files, savable=size))
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT pid2, package.name, package.architecture, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
+                        (pid,))
+            for pid2, package2, architecture2, func1, func2, files, size in fetchiter(cur):
+                curstats = sharedstats.setdefault(
+                        function_combination(func1, func2), list())
+                if pid2 == pid:
+                    package2 = None
+                    architecture2 = None
+                curstats.append(dict(package=package2,
+                                     architecture=architecture2,
+                                     duplicate=files, savable=size))
         return sharedstats
 
     def show_package(self, package):
@@ -151,7 +156,8 @@ class Application(object):
             package, architecture = package.split(':', 1)
         else:
             architecture = min(self.guess_package(package))
-            raise InternalRedirect("/binary/%s:%s" % (package, architecture))
+            raise InternalRedirect("/binary/%s:%s" % (package, architecture),
+                                   code=302)
         params = self.get_details(package, architecture)
         params["dependencies"] = self.get_dependencies(params["pid"])
         params["shared"] = self.cached_sharedstats(params["pid"])
@@ -164,7 +170,7 @@ class Application(object):
         return html_response(package_template.render(params))
 
     def compute_comparison(self, pid1, pid2):
-        """Compute a sequence of comparison objects ordery by the size of the
+        """Compute a sequence of comparison objects ordered by the size of the
         object in the first package. Each element of the sequence is a dict
         defining the following keys:
         * filenames: A set of filenames in package 1 (pid1) all referring to
@@ -179,6 +185,7 @@ class Application(object):
         cursize = -1
         files = dict()
         minmatch = 2 if pid1 == pid2 else 1
+        cur2 = self.db.cursor()
         for cid, filename, size, hashvalue in fetchiter(cur):
             if cursize != size:
                 for entry in files.values():
@@ -194,13 +201,12 @@ class Application(object):
                 entry = dict(filenames=set((filename,)), size=size,
                              matches={})
                 files[hashvalue] = entry
-            cur2 = self.db.cursor()
-            cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;",
+            cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;",
                          (cid, pid2))
             for func1, hashvalue, func2, filename in fetchiter(cur2):
                 entry["matches"].setdefault(filename, {})[func1, func2] = \
                         hashvalue
-            cur2.close()
+        cur2.close()
         cur.close()
 
         for entry in files.values():
@@ -222,7 +228,7 @@ class Application(object):
         if guessed:
             raise InternalRedirect("/compare/%s:%s/%s:%s" %
                                    (package1, architecture1,
-                                    package2, architecture2))
+                                    package2, architecture2), code=302)
         details1 = details2 = self.get_details(package1, architecture1)
         if package1 != package2 or architecture1 != architecture2:
             details2 = self.get_details(package2, architecture2)
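Guessed targets now redirect with 302 rather than the previous permanent redirect: a 301 would let clients cache, say, binary/git resolving to binary/git:amd64 even after a reimport changes which architectures exist. The werkzeug helper that replaces RequestRedirect takes the status code directly; for instance (hypothetical target):

    from werkzeug.utils import redirect

    response = redirect("/binary/git:amd64", 302)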
LIMIT 2;", + (function, hashvalue, hashvalue + '~')) + values = cur.fetchall() + if len(values) == 1: + raise InternalRedirect("/hash/%s/%s" % + (function, values[0][0]), 302) + raise NotFound() params = dict(function=function, hashvalue=hashvalue, entries=entries, urlroot="../..") return html_response(hash_template.render(params)) def show_source(self, package): - cur = self.db.cursor() - cur.execute("SELECT name FROM package WHERE source = ?;", - (package,)) - binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) - if not binpkgs: - raise NotFound - cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", - (package,)) - for binary, otherbin, func1, func2, files, size in fetchiter(cur): - entry = dict(package=otherbin, - funccomb=function_combination(func1, func2), - duplicate=files, savable=size) - oldentry = binpkgs.get(binary) - if not (oldentry and oldentry["savable"] >= size): - binpkgs[binary] = entry + with contextlib.closing(self.db.cursor()) as cur: + cur.execute("SELECT name FROM package WHERE source = ?;", + (package,)) + binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur)) + if not binpkgs: + raise NotFound + cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;", + (package,)) + for binary, otherbin, func1, func2, files, size in fetchiter(cur): + entry = dict(package=otherbin, + funccomb=function_combination(func1, func2), + duplicate=files, savable=size) + oldentry = binpkgs.get(binary) + if not (oldentry and oldentry["savable"] >= size): + binpkgs[binary] = entry params = dict(source=package, packages=binpkgs, urlroot="..") return html_response(source_template.render(params)) |