From e123f857a74e2ec8742ceebddebd7d296dcaca3c Mon Sep 17 00:00:00 2001
From: Helmut Grohne
Date: Fri, 10 Jun 2016 07:26:12 +0200
Subject: add a separate tool for generating hints on Multi-Arch headers

It builds on the core functionality of dedup, but uses a different database
schema. Unlike dedup, it aborts downloading Arch:all packages early and
consumes any other architecture in its entirety instead.
---
 README.multiarch     |  14 +++
 multiarchanalyze.py  | 114 ++++++++++++++++++++++
 multiarchanalyze.sql | 199 ++++++++++++++++++++++++++++++++++++++++++
 multiarchimport.py   | 242 +++++++++++++++++++++++++++++++++++++++++++++++++++
 multiarchschema.sql  |  31 +++++++
 5 files changed, 600 insertions(+)
 create mode 100644 README.multiarch
 create mode 100755 multiarchanalyze.py
 create mode 100644 multiarchanalyze.sql
 create mode 100755 multiarchimport.py
 create mode 100644 multiarchschema.sql

diff --git a/README.multiarch b/README.multiarch
new file mode 100644
index 0000000..120f119
--- /dev/null
+++ b/README.multiarch
@@ -0,0 +1,14 @@
+There is a separate analysis application for detecting multiarch-related bugs.
+It has the same requirements as the dedup application. It uses four files:
+
+    multiarchschema.sql
+    multiarchimport.py
+    multiarchanalyze.sql
+    multiarchanalyze.py
+
+With the schema you can initialize a database. The importer feeds the initial
+dataset or an updated dataset into the database by collecting data from a
+nearby Debian mirror. Apply multiarchanalyze.sql once after the initial data
+import; applying it earlier causes many updates to indices that are not yet
+used. The final step is generating the hints by running multiarchanalyze.py.
+Unless you issue ANALYZE first, the analysis will be slow.
diff --git a/multiarchanalyze.py b/multiarchanalyze.py
new file mode 100755
index 0000000..5cd5de7
--- /dev/null
+++ b/multiarchanalyze.py
@@ -0,0 +1,114 @@
+#!/usr/bin/python
+
+import argparse
+import sqlite3
+
+from dedup.utils import fetchiter
+
+def bipartitions(edges):
+    """
+    @type edges: {T: set(T)}
+    @rtype: [(set(T), set(T))]
+    """
+    todo = set(edges.keys())
+    result = []
+    colors = {}
+    while todo:
+        node = todo.pop()
+        colors[node] = False
+        partition = (set((node,)), set())
+        todo2 = set((node,))
+        while todo2:
+            partnode = todo2.pop()
+            for nextnode in edges[partnode]:
+                try:
+                    if colors[nextnode] == colors[partnode]:
+                        raise ValueError("not bipartite")
+                except KeyError:
+                    colors[nextnode] = not colors[partnode]
+                    partition[colors[nextnode]].add(nextnode)
+                    todo2.add(nextnode)
+                    todo.remove(nextnode)
+        result.append(partition)
+    return result
+
+def show_archset(archs, limit=3):
+    """
+    @type archs: set(str)
+    @rtype: str
+    """
+    archs = set(archs)
+    if len(archs) > limit:
+        return "%d archs" % len(archs)
+    return ", ".join(sorted(archs))
+
+def show_combinations(combinations):
+    edges = {}
+    for left, right in combinations:
+        edges.setdefault(left, set()).add(right)
+        edges.setdefault(right, set()).add(left)  # safety
+    if len(edges) == 2:
+        return "%s <-> %s" % (min(left, right), max(left, right))
+    try:
+        partitions = bipartitions(edges)
+    except ValueError:
+        pass
+    else:
+        if len(partitions) == 1:
+            return " <-> ".join(show_archset(archs, 4)
+                                for archs in sorted(partitions[0], key=len))
+        else:
+            return "%d bipartitions of %s" % (len(partitions),
+                                              show_archset(edges.keys()))
+    if all(len(outedges) == len(edges) - 1 for outedges in edges.values()):
+        return "any two of " + show_archset(edges.keys(), limit=4)
+    return "%s with %d combinations" % (show_archset(edges.keys()),
+                                        len(combinations))
+
+def show_files(filenames):
+    if len(filenames) == 1:
+        return next(iter(filenames))
+    return "%d files" % len(filenames)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", "--database", action="store",
+                        default="test.sqlite3",
+                        help="path to the sqlite3 database file")
+    args = parser.parse_args()
+    db = sqlite3.connect(args.database)
+    cur = db.cursor()
+    cur.execute("SELECT name1, architecture1, architecture2, filename FROM masame_conflict;")
+    same_conflicts = {}
+    for package, arch1, arch2, filename in fetchiter(cur):
+        conflicts = same_conflicts.setdefault(package, {})
+        conflicts.setdefault(filename, set()).add((arch1, arch2))
+    for package, conflicts in sorted(same_conflicts.items()):
+        all_combinations = set()
+        for combinations in conflicts.values():
+            all_combinations.update(combinations)
+        print("%s conflicts on %s on %s" %
+              (package, show_files(conflicts.keys()),
+               show_combinations(all_combinations)))
+
+    cur.execute("SELECT name FROM maforeign_candidate ORDER BY name;")
+    for name, in fetchiter(cur):
+        print("%s could be Multi-Arch: foreign" % name)
+
+    cur.execute("SELECT name FROM archall_candidate ORDER BY name;")
+    archall_suggests = set()
+    for name, in fetchiter(cur):
+        archall_suggests.add(name)
+        print("%s could be converted to Architecture: all + Multi-Arch: foreign" % name)
+
+    cur.execute("SELECT name FROM masame_candidate ORDER BY name;")
+    for name, in fetchiter(cur):
+        if name not in archall_suggests:
+            print("%s could be Multi-Arch: same" % name)
+    cur.execute("SELECT depender, dependee FROM colonany_candidate ORDER BY depender;")
+    for depender, dependee in fetchiter(cur):
+        print("%s could have its %s dependency annotated with :any" %
+              (depender, dependee))
+
+if __name__ == "__main__":
+    main()
diff --git a/multiarchanalyze.sql b/multiarchanalyze.sql
new file mode 100644
index 0000000..6b26ecd
--- /dev/null
+++ b/multiarchanalyze.sql
@@ -0,0 +1,199 @@
+CREATE INDEX IF NOT EXISTS depends_dependee_index ON depends(dependee);
+CREATE INDEX IF NOT EXISTS content_filename_index ON content(filename);
+
+BEGIN;
+
+/* Combinations of two packages and files with conflicting content. */
+DROP VIEW IF EXISTS contentconflict;
+CREATE VIEW contentconflict AS
+    SELECT p1.id AS pid1,
+           p2.id AS pid2,
+           p1.name AS name1,
+           p2.name AS name2,
+           p1.version AS version1,
+           p2.version AS version2,
+           p1.architecture AS architecture1,
+           p2.architecture AS architecture2,
+           p1.source AS source1,
+           p2.source AS source2,
+           p1.hasscripts AS hasscripts1,
+           p2.hasscripts AS hasscripts2,
+           p1.multiarch AS multiarch1,
+           p2.multiarch AS multiarch2,
+           c1.filename,
+           c1.id AS cid1,
+           c2.id AS cid2,
+           c1.hash AS hash1,
+           c2.hash AS hash2
+    FROM package AS p1, package AS p2, content AS c1, content AS c2
+    WHERE p1.id = c1.pid
+        AND p2.id = c2.pid
+        AND c1.filename = c2.filename
+        AND c1.hash != c2.hash;
+
+/* Candidates for satisfying architecture-enforcing dependencies considering
+ * provides. */
+DROP VIEW IF EXISTS archdepcandidate;
+CREATE VIEW archdepcandidate AS
+    SELECT p1.id AS dependerid, p2.id AS dependeeid
+    FROM package AS p1, depends AS d, package AS p2
+    WHERE p1.id = d.pid
+        AND d.dependee = p2.name
+        AND p2.multiarch IS NOT 'foreign'
+        AND (p2.multiarch IS NOT 'allowed' OR d.archqual IS NOT 'any')
+        AND (p1.architecture = p2.architecture
+             OR p1.architecture = 'all'
+             OR p2.architecture = 'all'
+             OR p2.architecture = d.archqual)
+    UNION
+    SELECT p1.id AS dependerid, p2.id AS dependeeid
+    FROM package AS p1, depends AS d, provides AS r, package AS p2
+    WHERE p1.id = d.pid
+        AND d.dependee = r.provided
+        AND r.pid = p2.id
+        AND p2.multiarch IS NOT 'foreign'
+        AND (p2.multiarch IS NOT 'allowed' OR d.archqual IS NOT 'any')
+        AND (p1.architecture = p2.architecture
+             OR p1.architecture = 'all'
+             OR p2.architecture = 'all'
+             OR p2.architecture = d.archqual);
+
+/* Describe properties of binary package names. */
+DROP VIEW IF EXISTS packageapi;
+CREATE VIEW packageapi AS
+    SELECT name,
+           count(*) AS instances,
+           count(CASE architecture WHEN 'all' THEN 1 ELSE NULL END)
+               AS hasindep,
+           CASE count(CASE architecture WHEN 'all' THEN NULL ELSE 1 END)
+               WHEN 0 THEN 0 ELSE 1 END
+               AS hasarchdep,
+           CASE min(source) WHEN max(source) THEN source ELSE NULL END
+               AS source,
+           max(hasscripts) AS hasscripts,
+           CASE min(multiarch)
+               WHEN max(multiarch) THEN multiarch
+               ELSE NULL END
+               AS multiarch
+    FROM package GROUP BY name;
+
+/* Architecture-dependent packages. */
+DROP VIEW IF EXISTS archpackage;
+CREATE VIEW archpackage AS
+    SELECT
+        p1.name,
+        p1.hasscripts,
+        p1.multiarch,
+        exists(
+            SELECT 1 FROM package AS p2, archdepcandidate AS c
+            WHERE p2.name = p1.name AND c.dependerid = p2.id)
+            AS hasarchdeps
+    FROM packageapi AS p1
+    WHERE p1.instances >= 3 AND p1.hasindep = 0;
+
+/* Architecture-independent packages. */
+DROP VIEW IF EXISTS indeppackage;
+CREATE VIEW indeppackage AS
+    SELECT
+        id,
+        name,
+        version,
+        source,
+        hasscripts,
+        multiarch,
+        exists(SELECT 1 FROM archdepcandidate WHERE dependerid = id)
+            AS hasarchdeps
+    FROM package
+    WHERE architecture = 'all';
+
+/* Packages violating M-A:same by shipping conflicting files. */
+DROP VIEW IF EXISTS masame_conflict;
+CREATE VIEW masame_conflict AS
+    SELECT name1, architecture1, architecture2, filename
+    FROM contentconflict
+    WHERE name1 = name2
+        AND version1 = version2
+        AND multiarch1 = 'same'
+        AND multiarch2 = 'same';
+
+/* Packages that could be marked M-A:foreign, because they
+ * * are Architecture: all
+ * * do not have maintainer scripts
+ * * do not have architecture-enforcing dependencies
+ */
+DROP VIEW IF EXISTS maforeign_candidate;
+CREATE VIEW maforeign_candidate AS
+    SELECT p1.name FROM indeppackage AS p1
+    WHERE p1.multiarch = 'no'
+        AND p1.hasscripts = 0
+        AND p1.hasarchdeps = 0;
+
+/* Packages that could be converted from Arch:any to Arch:all M-A:foreign,
+ * because they
+ * * do not have maintainer scripts
+ * * do not ship conflicting files
+ * * do not ship different file sets
+ * * do not have architecture-enforcing dependencies
+ */
+DROP VIEW IF EXISTS archall_candidate;
+CREATE VIEW archall_candidate AS
+    SELECT p1.name FROM archpackage AS p1
+    WHERE p1.hasscripts = 0
+        AND NOT EXISTS (
+            SELECT 1 FROM contentconflict
+            WHERE name1 = p1.name AND name2 = p1.name)
+        AND NOT EXISTS (
+            SELECT 1
+            FROM package AS p2, content AS c2, package AS p3
+            WHERE p2.name = p1.name
+                AND c2.pid = p2.id
+                AND p3.name = p1.name
+                AND NOT EXISTS (
+                    SELECT 1 FROM content AS c5
+                    WHERE c5.pid = p3.id
+                        AND c5.filename = c2.filename))
+        AND p1.hasarchdeps = 0;
+
+/* Packages that could be marked Multi-Arch:same, because they
+ * * are architecture dependent
+ * * do not have a Multi-Arch marking
+ * * do not have maintainer scripts
+ * * do not have file conflicts
+ */
+DROP VIEW IF EXISTS masame_candidate;
+CREATE VIEW masame_candidate AS
+    SELECT p1.name FROM archpackage AS p1
+    WHERE p1.multiarch = 'no'
+        AND p1.hasscripts = 0
+        AND NOT EXISTS (
+            SELECT 1 FROM contentconflict
+            WHERE name1 = p1.name AND name2 = p1.name);
+
+/* Package 'depender' has a dependency on 'dependee' that can be annotated with
+ * :any, because
+ * * 'depender' is architecture independent
+ * * 'depender' does not have maintainer scripts
+ * * the dependency on 'dependee' is the only architecture-enforcing dependency
+ * * 'dependee' is Multi-Arch: allowed
+ */
+DROP VIEW IF EXISTS colonany_candidate;
+CREATE VIEW colonany_candidate AS
+    SELECT depender, dependee
+    FROM (
+        SELECT
+            p1.name AS depender,
+            p2.name AS dependee,
+            CASE max(p2.multiarch)
+                WHEN min(p2.multiarch) THEN p2.multiarch
+                ELSE NULL END
+                AS multiarch
+        FROM indeppackage AS p1, archdepcandidate AS c, package AS p2
+        WHERE p1.hasscripts = 0
+            AND c.dependerid = p1.id
+            AND c.dependeeid = p2.id
+        GROUP BY p1.id, p2.name)
+    GROUP BY depender
+    HAVING count(dependee) = 1
+        AND min(CASE multiarch WHEN 'allowed' THEN 1 ELSE 0 END) = 1;
+
+COMMIT;
diff --git a/multiarchimport.py b/multiarchimport.py
new file mode 100755
index 0000000..37fa8f5
--- /dev/null
+++ b/multiarchimport.py
@@ -0,0 +1,242 @@
+#!/usr/bin/python
+
+import argparse
+from contextlib import closing
+import hashlib
+import logging
+import multiprocessing
+import Queue
+import sqlite3
+try:
+    from urllib.request import urlopen
+except ImportError:
+    from urllib2 import urlopen
+
+from debian import deb822
+from debian.debian_support import version_compare
+
+from dedup.debpkg import DebExtractor, decodetarname
+from dedup.hashing import hash_file
+from dedup.utils import fetchiter, open_compressed_mirror_url
+
+def release_architectures(mirror):
+    with closing(urlopen("%s/dists/testing/Release" % mirror)) as f:
+        return set(deb822.Release(f)["Architectures"].split())
+
+def gather_pkgmeta(pkg):
+    meta = dict(depends=set(), provides=set())
+    for key in ("version", "filename"):
+        meta[key] = pkg[key]
+    try:
+        meta["multiarch"] = pkg["multi-arch"]
+    except KeyError:
+        pass
+    try:
+        meta["source"] = pkg["source"].split(None, 1)[0]
+    except KeyError:
+        pass
+    for kind, relations in pkg.relations.items():
+        if kind == "pre-depends":
+            kind = "depends"
+        elif kind not in ("provides", "depends"):
+            continue
+        for relation in relations:
+            if kind == "provides" and len(relation) != 1:
+                raise ValueError("provides with alternatives")
+            for alternative in relation:
+                name = alternative["name"]
+                # deb822 currently returns :any dependencies raw. see #670679
+                archqual = alternative.get("archqual")
+                if ':' in name:
+                    name, archqual = name.split(None, 1)[0].split(u':')
+                if kind == "provides":
+                    if archqual:
+                        raise ValueError(
+                            "arch qualification not supported on provides")
+                    meta[kind].add(name)
+                else:
+                    meta[kind].add((name, archqual))
+    return meta
+
+def process_list(mirror, architecture):
+    pkgs = dict()
+    url = "%s/dists/sid/main/binary-%s/Packages" % (mirror, architecture)
+    with closing(open_compressed_mirror_url(url)) as pkglist:
+        for pkg in deb822.Packages.iter_paragraphs(pkglist):
+            if pkg["architecture"] != architecture:
+                continue  # e.g. Arch: all
+            try:
+                otherver = pkgs[pkg["package"]]["version"]
+            except KeyError:
+                pass
+            else:
+                if version_compare(otherver, pkg["version"]) < 0:
+                    continue
+            pkgs[pkg["package"]] = gather_pkgmeta(pkg)
+    return pkgs
+
+def process_one_list(item):
+    mirror, arch = item
+    return (arch, process_list(mirror, arch))
+
+class ProcessingFinished(Exception):
+    pass
+
+class MultiarchExtractor(DebExtractor):
+    def __init__(self):
+        DebExtractor.__init__(self)
+        self.architecture = None
+        self.result = dict(hasscripts=False)
+
+    def handle_control_info(self, info):
+        self.architecture = info["architecture"]
+
+    def handle_control_member(self, name, content):
+        if name in ("preinst", "postinst", "prerm", "postrm"):
+            self.result["hasscripts"] = True
+
+    def handle_control_end(self):
+        if self.architecture == "all":
+            raise ProcessingFinished
+
+    def handle_data_tar(self, tarfileobj):
+        self.result["contents"] = contents = dict()
+        for elem in tarfileobj:
+            try:
+                name = decodetarname(elem.name)
+            except UnicodeDecodeError:
+                logging.warning("skipping filename with encoding error %r",
+                                elem.name)
+                continue  # skip files with non-utf8 encoding for now
+            if elem.isreg():
+                hasher = hashlib.sha256()
+                hash_file(hasher, tarfileobj.extractfile(elem))
+                contents[name] = "R" + hasher.hexdigest()
+            elif elem.issym() or elem.islnk():
+                try:
+                    linkname = decodetarname(elem.linkname)
+                except UnicodeDecodeError:
+                    logging.warning("skipping link with encoding error %r",
+                                    elem.linkname)
+                    continue
+                if elem.issym():
+                    contents[name] = u"S" + linkname
+                else:
+                    try:
+                        contents[name] = contents[linkname]
+                    except KeyError:
+                        logging.warning("hard link to non-existent file %r",
+                                        linkname)
+        raise ProcessingFinished
+
+def process_one_package(item):
+    pid, url = item
+    extractor = MultiarchExtractor()
+    with closing(urlopen(url)) as pkgfile:
+        try:
+            extractor.process(pkgfile)
+        except ProcessingFinished:
+            pass
+    return (pid, extractor.result, url)
+
+def consume_items(dct):
+    while True:
+        try:
+            yield dct.popitem()
+        except KeyError:
+            break
+
+def bounded_imap_unordered(bound, pool, function, iterable):
+    iterable = iter(iterable)
+    results = Queue.Queue()
+    outstanding = 0
+    while iterable or outstanding:
+        if iterable:
+            for elem in iterable:
+                pool.apply_async(function, (elem,), callback=results.put)
+                outstanding += 1
+                if outstanding >= bound or not results.empty():
+                    break
+            else:
+                iterable = None
+        if outstanding:
+            yield results.get()
+            outstanding -= 1
+
+def insert_package(cur, name, arch, meta):
+    logging.debug("adding %s %s", name, meta["version"])
+    cur.execute("INSERT INTO package (name, version, architecture, source, multiarch) VALUES (?, ?, ?, ?, ?);",
+                (name, meta["version"], arch, meta.get("source", name),
+                 meta.get("multiarch", "no")))
+    pid = cur.lastrowid
+    logging.debug("added %s as %d", name, pid)
+    if meta["depends"]:
+        logging.debug("inserting %d dependencies for %s",
+                      len(meta["depends"]), name)
+        cur.executemany("INSERT INTO depends (pid, dependee, archqual) VALUES (?, ?, ?);",
+                        ((pid, dependee, archqual)
+                         for dependee, archqual in meta["depends"]))
+    if meta["provides"]:
+        logging.debug("inserting %d provides for %s",
+                      len(meta["provides"]), name)
+        cur.executemany("INSERT INTO provides (pid, provided) VALUES (?, ?);",
+                        ((pid, provided) for provided in meta["provides"]))
+    return pid
+
+def main():
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", "--database", action="store",
+                        default="test.sqlite3",
+                        help="path to the sqlite3 database file")
+    parser.add_argument("-m", "--mirror", action="store",
+                        default="http://httpredir.debian.org/debian",
+                        help="Debian mirror to use")
+    args = parser.parse_args()
+    architectures = release_architectures(args.mirror)
+    architectures.add("all")
+    workers = multiprocessing.cpu_count()
+    pool = multiprocessing.Pool(workers)
+    db = sqlite3.connect(args.database)
+    cur = db.cursor()
+    cur.execute("PRAGMA foreign_keys = ON;")
+    todo = {}
+    for arch, pkgs in bounded_imap_unordered(3, pool, process_one_list,
+                                             ((args.mirror, arch)
+                                              for arch in architectures)):
+        logging.info("found %d Packages for architecture %s", len(pkgs), arch)
+        cur.execute("SELECT name, id, version FROM package WHERE architecture = ?;",
+                    (arch,))
+        remove = set()
+        for name, pid, version in fetchiter(cur):
+            if name in pkgs and version == pkgs[name]["version"]:
+                del pkgs[name]
+            else:
+                remove.add(pid)
+        logging.info("%d Packages are new", len(pkgs))
+        logging.info("removing %d old packages", len(remove))
+        cur.executemany("DELETE FROM package WHERE id = ?;",
+                        ((pid,) for pid in remove))
+        del remove
+
+        for name, meta in pkgs.items():
+            pid = insert_package(cur, name, arch, meta)
+            todo[pid] = meta["filename"]
+        del pkgs
+
+    logging.info("need to fetch %d packages", len(todo))
+    for pid, result, url in bounded_imap_unordered(2 * workers, pool,
+                                                   process_one_package,
+                                                   ((pid, "%s/%s" % (args.mirror, filename))
+                                                    for pid, filename in consume_items(todo))):
+        logging.debug("fetched %d from %s", pid, url)
+        cur.execute("UPDATE package SET hasscripts = ? WHERE id = ?;",
+                    (result["hasscripts"], pid))
+        if "contents" in result:
+            cur.executemany("INSERT INTO content (pid, filename, hash) VALUES (?, ?, ?);",
+                            ((pid, key, value)
+                             for key, value in result["contents"].items()))
+    db.commit()
+
+if __name__ == "__main__":
+    main()
diff --git a/multiarchschema.sql b/multiarchschema.sql
new file mode 100644
index 0000000..d3dc2eb
--- /dev/null
+++ b/multiarchschema.sql
@@ -0,0 +1,31 @@
+CREATE TABLE package (
+    id INTEGER PRIMARY KEY,
+    name TEXT NOT NULL,
+    version TEXT NOT NULL,
+    architecture TEXT NOT NULL,
+    source TEXT NOT NULL,
+    hasscripts BOOL,
+    multiarch TEXT
+        CHECK (multiarch IN ('allowed', 'foreign', 'no', 'same')),
+    UNIQUE (name, architecture));
+
+CREATE TABLE depends (
+    id INTEGER PRIMARY KEY,
+    pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+    dependee TEXT NOT NULL,
+    archqual TEXT);
+
+CREATE TABLE provides (
+    id INTEGER PRIMARY KEY,
+    pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+    provided TEXT NOT NULL);
+
+CREATE TABLE content (
+    id INTEGER PRIMARY KEY,
+    pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+    filename TEXT NOT NULL,
+    hash TEXT NOT NULL);
+
+CREATE INDEX depends_pid_index ON depends(pid);
+CREATE INDEX provides_pid_index ON provides(pid);
+CREATE INDEX content_pid_index ON content(pid);
-- 
cgit v1.2.3
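
The README above boils down to a four-step pipeline: create the database from
multiarchschema.sql, import with multiarchimport.py, apply multiarchanalyze.sql
once and issue ANALYZE, then run multiarchanalyze.py. A minimal driver sketch
of that pipeline, not part of the patch and assuming the four files live in the
current directory with the default test.sqlite3 database path, could look like
this:

#!/usr/bin/python
# Hypothetical convenience wrapper around the workflow from README.multiarch.
import sqlite3
import subprocess

DB = "test.sqlite3"

# 1. Initialize the schema.
with sqlite3.connect(DB) as db:
    db.executescript(open("multiarchschema.sql").read())

# 2. Feed the initial dataset from a nearby mirror.
subprocess.check_call(["./multiarchimport.py", "-d", DB])

# 3. Create the analysis indices and views once, then refresh query planner
#    statistics so the hint queries do not run slowly.
with sqlite3.connect(DB) as db:
    db.executescript(open("multiarchanalyze.sql").read())
    db.execute("ANALYZE;")

# 4. Generate the hints.
subprocess.check_call(["./multiarchanalyze.py", "-d", DB])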
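
multiarchanalyze.py condenses the per-file architecture conflicts into readable
hints: every conflicting (arch1, arch2) pair becomes an edge in a graph, and
bipartitions() attempts to split each connected component into two opposing
sides so that show_combinations() can print something like
"amd64 <-> armhf, i386" instead of enumerating every pair. A small
illustration, assuming the repository (including its dedup package) is on
PYTHONPATH so the module imports cleanly:

#!/usr/bin/python
# Illustration only: exercise bipartitions() with hand-made conflict graphs.
from multiarchanalyze import bipartitions

# amd64 conflicts with armhf and with i386, but armhf and i386 agree:
# one component, split into {'amd64'} versus {'armhf', 'i386'}
# (which side comes first may vary between runs).
edges = {
    "amd64": {"armhf", "i386"},
    "armhf": {"amd64"},
    "i386": {"amd64"},
}
print(bipartitions(edges))

# A triangle of conflicts is not bipartite, so bipartitions() raises
# ValueError and show_combinations() falls back to the "any two of ..."
# wording.
triangle = {
    "amd64": {"armhf", "i386"},
    "armhf": {"amd64", "i386"},
    "i386": {"amd64", "armhf"},
}
try:
    bipartitions(triangle)
except ValueError as err:
    print(err)  # not bipartite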