From ca65a78a9ace0aeb2565df0da171727c04c33970 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Tue, 10 Sep 2013 09:39:40 +0200
Subject: webapp: close database cursors

Leaking them can result in running out of available file descriptors.
---
 webapp.py | 98 ++++++++++++++++++++++++++++++++------------------------------
 1 file changed, 50 insertions(+), 48 deletions(-)

diff --git a/webapp.py b/webapp.py
index d5f076e..4478ba0 100755
--- a/webapp.py
+++ b/webapp.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 
+import contextlib
 import datetime
 import optparse
 import sqlite3
@@ -98,42 +99,43 @@ class Application(object):
         return e
 
     def get_details(self, package):
-        cur = self.db.cursor()
-        cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
-                    (package,))
-        row = cur.fetchone()
-        if not row:
-            raise NotFound()
-        pid, version, architecture = row
-        details = dict(pid=pid,
-                       package=package,
-                       version=version,
-                       architecture=architecture)
-        cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
-                    (pid,))
-        num_files, total_size = cur.fetchone()
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT id, version, architecture FROM package WHERE name = ?;",
+                        (package,))
+            row = cur.fetchone()
+            if not row:
+                raise NotFound()
+            pid, version, architecture = row
+            details = dict(pid=pid,
+                           package=package,
+                           version=version,
+                           architecture=architecture)
+            cur.execute("SELECT count(filename), sum(size) FROM content WHERE pid = ?;",
+                        (pid,))
+            num_files, total_size = cur.fetchone()
         if total_size is None:
             total_size = 0
         details.update(dict(num_files=num_files, total_size=total_size))
         return details
 
     def get_dependencies(self, pid):
-        cur = self.db.cursor()
-        cur.execute("SELECT required FROM dependency WHERE pid = ?;",
-                    (pid,))
-        return set(row[0] for row in fetchiter(cur))
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT required FROM dependency WHERE pid = ?;",
+                        (pid,))
+            return set(row[0] for row in fetchiter(cur))
 
     def cached_sharedstats(self, pid):
-        cur = self.db.cursor()
         sharedstats = {}
-        cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
-                    (pid,))
-        for pid2, package2, func1, func2, files, size in fetchiter(cur):
-            curstats = sharedstats.setdefault(
-                    function_combination(func1, func2), list())
-            if pid2 == pid:
-                package2 = None
-            curstats.append(dict(package=package2, duplicate=files, savable=size))
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT pid2, package.name, f1.name, f2.name, files, size FROM sharing JOIN package ON sharing.pid2 = package.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE pid1 = ? AND f1.eqclass = f2.eqclass;",
+                        (pid,))
+            for pid2, package2, func1, func2, files, size in fetchiter(cur):
+                curstats = sharedstats.setdefault(
+                        function_combination(func1, func2), list())
+                if pid2 == pid:
+                    package2 = None
+                curstats.append(dict(package=package2, duplicate=files,
+                                     savable=size))
         return sharedstats
 
     def show_package(self, package):
@@ -206,12 +208,12 @@ class Application(object):
         return html_response(detail_template.stream(params))
 
     def show_hash(self, function, hashvalue):
-        cur = self.db.cursor()
-        cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
-                    (function, hashvalue,))
-        entries = [dict(package=package, filename=filename, size=size,
-                        function=otherfunc)
-                   for package, filename, size, otherfunc in fetchiter(cur)]
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT package.name, content.filename, content.size, f2.name FROM hash JOIN content ON hash.cid = content.id JOIN package ON content.pid = package.id JOIN function AS f2 ON hash.fid = f2.id JOIN function AS f1 ON f2.eqclass = f1.eqclass WHERE f1.name = ? AND hash = ?;",
+                        (function, hashvalue,))
+            entries = [dict(package=package, filename=filename, size=size,
+                            function=otherfunc)
+                       for package, filename, size, otherfunc in fetchiter(cur)]
         if not entries:
             raise NotFound()
         params = dict(function=function, hashvalue=hashvalue, entries=entries,
@@ -219,21 +221,21 @@ class Application(object):
         return html_response(hash_template.render(params))
 
     def show_source(self, package):
-        cur = self.db.cursor()
-        cur.execute("SELECT name FROM package WHERE source = ?;",
-                    (package,))
-        binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
-        if not binpkgs:
-            raise NotFound
-        cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;",
-                    (package,))
-        for binary, otherbin, func1, func2, files, size in fetchiter(cur):
-            entry = dict(package=otherbin,
-                         funccomb=function_combination(func1, func2),
-                         duplicate=files, savable=size)
-            oldentry = binpkgs.get(binary)
-            if not (oldentry and oldentry["savable"] >= size):
-                binpkgs[binary] = entry
+        with contextlib.closing(self.db.cursor()) as cur:
+            cur.execute("SELECT name FROM package WHERE source = ?;",
+                        (package,))
+            binpkgs = dict.fromkeys(pkg for pkg, in fetchiter(cur))
+            if not binpkgs:
+                raise NotFound
+            cur.execute("SELECT p1.name, p2.name, f1.name, f2.name, sharing.files, sharing.size FROM sharing JOIN package AS p1 ON sharing.pid1 = p1.id JOIN package AS p2 ON sharing.pid2 = p2.id JOIN function AS f1 ON sharing.fid1 = f1.id JOIN function AS f2 ON sharing.fid2 = f2.id WHERE p1.source = ?;",
+                        (package,))
+            for binary, otherbin, func1, func2, files, size in fetchiter(cur):
+                entry = dict(package=otherbin,
+                             funccomb=function_combination(func1, func2),
+                             duplicate=files, savable=size)
+                oldentry = binpkgs.get(binary)
+                if not (oldentry and oldentry["savable"] >= size):
+                    binpkgs[binary] = entry
         params = dict(source=package, packages=binpkgs, urlroot="..")
         return html_response(source_template.render(params))
-- 
cgit v1.2.3
From 786c4f93ea318a3c22479f80531594435fb036c3 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 11 Sep 2013 08:35:41 +0200
Subject: webapp: open cursors less often

On the main instance, opening a cursor equals initiating a connection.
Unfortunately sqlite3.Connection.close does not close file descriptors.
So just open fewer cursors to leak file descriptors less often.
---
 webapp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/webapp.py b/webapp.py
index 4478ba0..665ac23 100755
--- a/webapp.py
+++ b/webapp.py
@@ -166,6 +166,7 @@ class Application(object):
         cursize = -1
         files = dict()
         minmatch = 2 if pid1 == pid2 else 1
+        cur2 = self.db.cursor()
         for cid, filename, size, hashvalue in fetchiter(cur):
             if cursize != size:
                 for entry in files.values():
@@ -181,13 +182,12 @@ class Application(object):
                 entry = dict(filenames=set((filename,)), size=size,
                              matches={})
                 files[hashvalue] = entry
-            cur2 = self.db.cursor()
             cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ?;",
                          (cid, pid2))
             for func1, hashvalue, func2, filename in fetchiter(cur2):
                 entry["matches"].setdefault(filename, {})[func1, func2] = \
                         hashvalue
-            cur2.close()
+        cur2.close()
         cur.close()
 
         for entry in files.values():
-- 
cgit v1.2.3

From d228c0a4a5827325bca47d63ea287c7cb56537ea Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Thu, 3 Oct 2013 08:51:41 +0200
Subject: work around python-debian's #670679
---
 dedup/debpkg.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dedup/debpkg.py b/dedup/debpkg.py
index d8cc22f..2d67135 100644
--- a/dedup/debpkg.py
+++ b/dedup/debpkg.py
@@ -16,8 +16,8 @@ def process_control(control_contents):
         source = package
     version = control["version"].encode("ascii")
     architecture = control["architecture"].encode("ascii")
-
-    depends = set(dep[0]["name"].encode("ascii")
+    # deb822 currently returns :any dependencies raw. see #670679
+    depends = set(dep[0]["name"].split(u':', 1)[0].encode("ascii")
                   for dep in control.relations.get("depends", ())
                   if len(dep) == 1)
     return dict(package=package, source=source, version=version,
-- 
cgit v1.2.3
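The workaround itself is a plain string split: deb822 hands back multiarch-qualified names such as "foo:any" verbatim, so the architecture qualifier is cut off before encoding. A standalone sketch of the transformation (hypothetical input list, not the full process_control):

    # deb822 returns ":any" dependencies raw (Debian bug #670679);
    # keep only the package name left of the first colon.
    raw_names = [u"libc6", u"python:any"]
    depends = set(name.split(u':', 1)[0].encode("ascii") for name in raw_names)
    assert depends == set([b"libc6", b"python"])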
From 17597b5e828f9bbc9b0159102b173c284c23a140 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 19 Feb 2014 07:54:21 +0100
Subject: DecompressedHash should fail on trailing input

Otherwise all files smaller than 10 bytes are successfully hashed to
the hash of the empty input when using the GzipDecompressor.

Reported-By: Olly Betts
---
 dedup/hashing.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/dedup/hashing.py b/dedup/hashing.py
index 002eda8..5f015b2 100644
--- a/dedup/hashing.py
+++ b/dedup/hashing.py
@@ -49,9 +49,13 @@ class DecompressedHash(object):
 
     def hexdigest(self):
         if not hasattr(self.decompressor, "flush"):
+            if self.decompressor.unused_data:
+                raise ValueError("decompressor did not consume all data")
             return self.hashobj.hexdigest()
         tmpdecomp = self.decompressor.copy()
         data = tmpdecomp.flush()
+        if tmpdecomp.unused_data:
+            raise ValueError("decompressor did not consume all data")
         tmphash = self.hashobj.copy()
         tmphash.update(data)
         return tmphash.hexdigest()
-- 
cgit v1.2.3

From d467a2a4e85d4b6f09bd2e3dc70466bfcc45a577 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Wed, 19 Feb 2014 14:19:56 +0100
Subject: GzipDecompressor: don't treat checksum as garbage trailer
---
 dedup/compression.py | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/dedup/compression.py b/dedup/compression.py
index 869c49f..4ce258c 100644
--- a/dedup/compression.py
+++ b/dedup/compression.py
@@ -5,8 +5,11 @@ class GzipDecompressor(object):
     """An interface to gzip which is similar to bz2.BZ2Decompressor and
     lzma.LZMADecompressor."""
     def __init__(self):
+        self.sawheader = False
         self.inbuffer = b""
         self.decompressor = None
+        self.crc = 0
+        self.size = 0
 
     def decompress(self, data):
         """
@@ -16,6 +19,8 @@
         while True:
             if self.decompressor:
                 data = self.decompressor.decompress(data)
+                self.crc = zlib.crc32(data, self.crc)
+                self.size += len(data)
                 unused_data = self.decompressor.unused_data
                 if not unused_data:
                     return data
@@ -45,13 +50,20 @@
                 return b""
             data = self.inbuffer[skip:]
             self.inbuffer = b""
+            self.sawheader = True
             self.decompressor = zlib.decompressobj(-zlib.MAX_WBITS)
 
     @property
     def unused_data(self):
         if self.decompressor:
             return self.decompressor.unused_data
+        elif not self.sawheader:
+            return self.inbuffer
         else:
+            expect = struct.pack("<ll", self.crc, self.size)
+            if self.inbuffer.startswith(expect):
+                return self.inbuffer[len(expect):]
             return self.inbuffer
 
     def flush(self):
-- 
cgit v1.2.3
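The bookkeeping added above mirrors the gzip file format: each member ends with an 8-byte trailer carrying the CRC-32 of the uncompressed data and its length modulo 2^32, both little-endian, which is exactly the data the decompressor must no longer mistake for a garbage trailer. A sketch of how such a trailer is laid out (illustrative; it uses unsigned struct formats and is independent of the GzipDecompressor internals):

    import struct
    import zlib

    payload = b"hello"
    # CRC-32 of the uncompressed data, then ISIZE (length mod 2**32),
    # both little-endian: the final 8 bytes of a gzip member.
    trailer = struct.pack("<II", zlib.crc32(payload) & 0xffffffff,
                          len(payload) % 2**32)
    assert len(trailer) == 8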
+ """ + self.hashobj = hashobj + self.blacklist = blacklist + if maxlen is None: + # the chain avoids passing the empty sequence to max + maxlen = max(itertools.chain((0,), itertools.imap(len, blacklist))) + self.maxlen = maxlen + self.stored = "" + + @property + def name(self): + return self.hashobj.name + + def update(self, data): + if self.stored is not None: + self.stored += data + if len(self.stored) > self.maxlen: + self.stored = None + self.hashobj.update(data) + + def digest(self): + if self.stored is not None and self.stored in self.blacklist: + return None + return self.hashobj.digest() + + def hexdigest(self): + if self.stored is not None and self.stored in self.blacklist: + return None + return self.hashobj.hexdigest() + + def copy(self): + return HashBlacklistContent(self.hashobj.copy(), self.blacklist, + self.maxlen) + class DecompressedHash(object): """Apply a decompression function before the hash. This class provides the hashlib interface (update, hexdigest, copy) excluding digest and name.""" diff --git a/importpkg.py b/importpkg.py index 54f6181..cb16f97 100755 --- a/importpkg.py +++ b/importpkg.py @@ -16,26 +16,21 @@ import yaml from dedup.arreader import ArReader from dedup.debpkg import process_control, get_tar_hashes -from dedup.hashing import HashBlacklist, DecompressedHash, SuppressingHash, \ - HashedStream +from dedup.hashing import DecompressedHash, SuppressingHash, HashedStream, \ + HashBlacklistContent from dedup.compression import GzipDecompressor, DecompressedStream from dedup.image import GIFHash, PNGHash -boring_sha512_hashes = set(( - # "" - "cf83e1357eefb8bdf1542850d66d8007d620e4050b5715dc83f4a921d36ce9ce47d0d13c5d85f2b0ff8318d2877eec2f63b931bd47417a81a538327af927da3e", - # "\n" - "be688838ca8686e5c90689bf2ab585cef1137c999b48c70b92f67a5c34dc15697b5d11c982ed6d71be1e1e7f7b4e0733884aa97c3f7a339a8ed03577cf74be09")) +boring_content = set(("", "\n")) def sha512_nontrivial(): - return HashBlacklist(hashlib.sha512(), boring_sha512_hashes) + return HashBlacklistContent(hashlib.sha512(), boring_content) def gziphash(): hashobj = DecompressedHash(GzipDecompressor(), hashlib.sha512()) hashobj = SuppressingHash(hashobj, (ValueError, zlib.error)) hashobj.name = "gzip_sha512" - # don't blacklist boring hashes for gzip to get gzip issues right - return hashobj + return HashBlacklistContent(hashobj, boring_content) def pnghash(): hashobj = PNGHash(hashlib.sha512()) -- cgit v1.2.3 From 7389e4b00f6add611e8d6b318654056097d6d546 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Fri, 21 Feb 2014 21:59:04 +0100 Subject: update_sharing: weaken assumptions about db layout Hash functions are partitioned into equivalence classes. We are generally only interested in sharing among hash functions with the same equivalence class, but the algorithm would compute any sharing. While the current layout never produces the same hashes for functions in difference equivalence classes (for different output length), that may change in future. Also allow hash functions, that belong to no equivalence class at all (eqclass = NULL) as a means to add additional metadata to content without computing any sharing for it. 
From 7389e4b00f6add611e8d6b318654056097d6d546 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 21 Feb 2014 21:59:04 +0100
Subject: update_sharing: weaken assumptions about db layout

Hash functions are partitioned into equivalence classes. We are
generally only interested in sharing among hash functions with the same
equivalence class, but the algorithm would compute any sharing. While
the current layout never produces the same hashes for functions in
different equivalence classes (for different output lengths), that may
change in the future. Also allow hash functions that belong to no
equivalence class at all (eqclass = NULL) as a means to add additional
metadata to content without computing any sharing for it.
---
 update_sharing.py | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/update_sharing.py b/update_sharing.py
index 1ff0fd8..ca6890b 100755
--- a/update_sharing.py
+++ b/update_sharing.py
@@ -47,14 +47,20 @@ def main(db):
     readcur = db.cursor()
     readcur.execute("SELECT hash FROM hash GROUP BY hash HAVING count(*) > 1;")
     for hashvalue, in fetchiter(readcur):
-        cur.execute("SELECT content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id WHERE hash = ?;",
+        cur.execute("SELECT function.eqclass, content.pid, content.id, content.filename, content.size, hash.fid FROM hash JOIN content ON hash.cid = content.id JOIN function ON hash.fid = function.id AND function.eqclass IS NOT NULL WHERE hash = ?;",
                     (hashvalue,))
-        rows = cur.fetchall()
-        print("processing hash %s with %d entries" % (hashvalue, len(rows)))
-        pkgdict = compute_pkgdict(rows)
-        cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
-                        [(row[1],) for row in rows])
-        process_pkgdict(cur, pkgdict)
+        rowdict = dict()
+        for row in cur.fetchall():
+            rowdict.setdefault(row[0], []).append(row[1:])
+        for eqclass, rows in rowdict.items():
+            if len(rows) < 2:
+                print("skipping hash %s class %d with too few entries" % (hashvalue, eqclass))
+                continue
+            print("processing hash %s class %d with %d entries" % (hashvalue, eqclass, len(rows)))
+            pkgdict = compute_pkgdict(rows)
+            cur.executemany("INSERT OR IGNORE INTO duplicate (cid) VALUES (?);",
+                            [(row[1],) for row in rows])
+            process_pkgdict(cur, pkgdict)
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'file named something.gz is not a valid gzip file' FROM content WHERE content.filename LIKE '%.gz' AND NOT EXISTS (SELECT 1 FROM hash JOIN function ON hash.fid = function.id WHERE hash.cid = content.id AND function.name = 'gzip_sha512');")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'png image not named something.png' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'png_sha512' AND lower(filename) NOT LIKE '%.png';")
     cur.execute("INSERT INTO issue (cid, issue) SELECT content.id, 'gif image not named something.gif' FROM content JOIN hash ON content.id = hash.cid JOIN function ON hash.fid = function.id WHERE function.name = 'gif_sha512' AND lower(filename) NOT LIKE '%.gif';")
-- 
cgit v1.2.3
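The rewrite partitions the rows for one hash value by equivalence class before any sharing is computed, and classes with fewer than two rows are skipped. The grouping idiom in isolation (hypothetical rows shaped like the query results above: eqclass first, then pid, cid, filename, size, fid):

    rows = [(1, 10, 100, "usr/bin/a", 42, 1),
            (1, 11, 101, "usr/bin/b", 42, 2),
            (2, 12, 102, "usr/share/c", 42, 3)]
    rowdict = dict()
    for row in rows:
        rowdict.setdefault(row[0], []).append(row[1:])
    for eqclass, entries in rowdict.items():
        if len(entries) < 2:
            continue  # no sharing possible within this class
        # compute_pkgdict(entries) and the inserts would run here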
AND pid = ?;", + cur2.execute("SELECT fa.name, ha.hash, fb.name, filename FROM hash AS ha JOIN hash AS hb ON ha.hash = hb.hash JOIN content ON hb.cid = content.id JOIN function AS fa ON ha.fid = fa.id JOIN function AS fb ON hb.fid = fb.id WHERE ha.cid = ? AND pid = ? AND fa.eqclass = fb.eqclass;", (cid, pid2)) for func1, hashvalue, func2, filename in fetchiter(cur2): entry["matches"].setdefault(filename, {})[func1, func2] = \ -- cgit v1.2.3 From 8ccd5205f77276b333c56efb8271a0ddf11590a0 Mon Sep 17 00:00:00 2001 From: Helmut Grohne Date: Sun, 23 Feb 2014 17:29:41 +0100 Subject: fix spelling mistake Reported-By: Stefan Kaltenbrunner --- dedup/templates/index.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dedup/templates/index.html b/dedup/templates/index.html index 7c9000f..169027e 100644 --- a/dedup/templates/index.html +++ b/dedup/templates/index.html @@ -28,7 +28,7 @@ {% block content %}
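Outside SQL, the added predicate amounts to a simple rule: two hash functions may only be compared when they share an equivalence class. A sketch with made-up class assignments (the real ones live in the function table):

    eqclass = {"sha512": 1, "gzip_sha512": 1, "png_sha512": 2}

    def comparable(func_a, func_b):
        # mirrors the SQL predicate "fa.eqclass = fb.eqclass"
        return eqclass[func_a] == eqclass[func_b]

    assert comparable("sha512", "gzip_sha512")
    assert not comparable("sha512", "png_sha512")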
From 8ccd5205f77276b333c56efb8271a0ddf11590a0 Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Sun, 23 Feb 2014 17:29:41 +0100
Subject: fix spelling mistake

Reported-By: Stefan Kaltenbrunner
---
 dedup/templates/index.html | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dedup/templates/index.html b/dedup/templates/index.html
index 7c9000f..169027e 100644
--- a/dedup/templates/index.html
+++ b/dedup/templates/index.html
@@ -28,7 +28,7 @@
 {% block content %}
 <h1>Debian duplication detector</h1>
 <ul>
-<li>To inspect a particlar binary package, go to
+<li>To inspect a particular binary package, go to
 <pre>binary/&lt;packagename&gt;</pre>
 Example: <a href="binary/git">binary/git</a></li>