-rw-r--r--  README.multiarch       14
-rwxr-xr-x  multiarchanalyze.py   114
-rw-r--r--  multiarchanalyze.sql  199
-rwxr-xr-x  multiarchimport.py    242
-rw-r--r--  multiarchschema.sql    31
5 files changed, 600 insertions, 0 deletions
diff --git a/README.multiarch b/README.multiarch
new file mode 100644
index 0000000..120f119
--- /dev/null
+++ b/README.multiarch
@@ -0,0 +1,14 @@
+There is a separate analysis application for detecting multiarch-related bugs.
+It has the same requirements as the dedup application and consists of four files:
+
+ multiarchschema.sql
+ multiarchimport.py
+ multiarchanalyze.sql
+ multiarchanalyze.py
+
+With the schema you can initialize a database. The importer feeds an initial
+or updated dataset into that database, collecting the data from a nearby
+Debian mirror. It is recommended to apply multiarchanalyze.sql once after the
+initial data import; applying it earlier causes lots of churn on indices that
+are not used yet. The final step is generating the hints by running
+multiarchanalyze.py. Unless ANALYZE has been issued, the analysis will be slow.
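+
+A typical initial round trip could look like this (an illustrative sketch; it
+assumes the default database name test.sqlite3 and the default mirror, adjust
+the options to your setup):
+
+ sqlite3 test.sqlite3 < multiarchschema.sql
+ ./multiarchimport.py -d test.sqlite3
+ sqlite3 test.sqlite3 < multiarchanalyze.sql
+ sqlite3 test.sqlite3 "ANALYZE;"
+ ./multiarchanalyze.py -d test.sqlite3 > hints.txt
+
+Later updates only need the multiarchimport.py and multiarchanalyze.py steps.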
diff --git a/multiarchanalyze.py b/multiarchanalyze.py
new file mode 100755
index 0000000..5cd5de7
--- /dev/null
+++ b/multiarchanalyze.py
@@ -0,0 +1,114 @@
+#!/usr/bin/python
+
+import argparse
+import sqlite3
+
+from dedup.utils import fetchiter
+
+def bipartitions(edges):
+    """Two-colour the connected components of the undirected graph given as
+    a symmetric adjacency mapping. Raises ValueError if some component is
+    not bipartite.
+    @type edges: {T: set(T)}
+    @rtype: [(set(T), set(T))]
+    @returns: a pair of vertex sets (one per colour) for each component
+    """
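+    # For instance (illustrative): for the four-cycle
+    # {"a": {"b", "d"}, "b": {"a", "c"}, "c": {"b", "d"}, "d": {"a", "c"}}
+    # the single component splits into the colour classes {"a", "c"} and
+    # {"b", "d"}; a triangle would raise ValueError as it is not bipartite.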
+    todo = set(edges.keys())
+    result = []
+    colors = {}
+    while todo:
+        node = todo.pop()
+        colors[node] = False
+        partition = (set((node,)), set())
+        todo2 = set((node,))
+        while todo2:
+            partnode = todo2.pop()
+            for nextnode in edges[partnode]:
+                try:
+                    if colors[nextnode] == colors[partnode]:
+                        raise ValueError("not bipartite")
+                except KeyError:
+                    colors[nextnode] = not colors[partnode]
+                    partition[colors[nextnode]].add(nextnode)
+                    todo2.add(nextnode)
+                    todo.remove(nextnode)
+        result.append(partition)
+    return result
+
+def show_archset(archs, limit=3):
+    """Render a set of architecture names compactly: list them when there
+    are at most limit of them, otherwise just give their number.
+    @type archs: set(str)
+    @rtype: str
+    """
+    archs = set(archs)
+    if len(archs) > limit:
+        return "%d archs" % len(archs)
+    return ", ".join(sorted(archs))
+
+def show_combinations(combinations):
+    """Render a set of (architecture, architecture) pairs compactly."""
+    edges = {}
+    for left, right in combinations:
+        edges.setdefault(left, set()).add(right)
+        edges.setdefault(right, set()).add(left) # ensure the mapping is symmetric
+    if len(edges) == 2:
+        return "%s <-> %s" % (min(left, right), max(left, right))
+    try:
+        partitions = bipartitions(edges)
+    except ValueError:
+        pass
+    else:
+        if len(partitions) == 1:
+            return " <-> ".join(show_archset(archs, 4)
+                                for archs in sorted(partitions[0], key=len))
+        else:
+            return "%d bipartitions of %s" % (len(partitions),
+                                              show_archset(edges.keys()))
+    if all(len(outedges) == len(edges) - 1 for outedges in edges.values()):
+        return "any two of " + show_archset(edges.keys(), limit=4)
+    return "%s with %d combinations" % (show_archset(edges.keys()),
+                                        len(combinations))
+
+def show_files(filenames):
+    if len(filenames) == 1:
+        return next(iter(filenames))
+    return "%d files" % len(filenames)
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", "--database", action="store",
+                        default="test.sqlite3",
+                        help="path to the sqlite3 database file")
+    args = parser.parse_args()
+    db = sqlite3.connect(args.database)
+    cur = db.cursor()
+    cur.execute("SELECT name1, architecture1, architecture2, filename FROM masame_conflict;")
+    same_conflicts = {}
+    for package, arch1, arch2, filename in fetchiter(cur):
+        conflicts = same_conflicts.setdefault(package, {})
+        conflicts.setdefault(filename, set()).add((arch1, arch2))
+    for package, conflicts in sorted(same_conflicts.items()):
+        all_combinations = set()
+        for combinations in conflicts.values():
+            all_combinations.update(combinations)
+        print("%s conflicts on %s on %s" %
+              (package, show_files(conflicts.keys()),
+               show_combinations(all_combinations)))
+
+    cur.execute("SELECT name FROM maforeign_candidate ORDER BY name;")
+    for name, in fetchiter(cur):
+        print("%s could be Multi-Arch: foreign" % name)
+
+    cur.execute("SELECT name FROM archall_candidate ORDER BY name;")
+    archall_suggests = set()
+    for name, in fetchiter(cur):
+        archall_suggests.add(name)
+        print("%s could be converted to Architecture: all + Multi-Arch: foreign" % name)
+
+    cur.execute("SELECT name FROM masame_candidate ORDER BY name;")
+    for name, in fetchiter(cur):
+        if name not in archall_suggests:
+            print("%s could be Multi-Arch: same" % name)
+
+    cur.execute("SELECT depender, dependee FROM colonany_candidate ORDER BY depender;")
+    for depender, dependee in fetchiter(cur):
+        print("%s could have its %s dependency annotated with :any" %
+              (depender, dependee))
+
+if __name__ == "__main__":
+    main()
diff --git a/multiarchanalyze.sql b/multiarchanalyze.sql
new file mode 100644
index 0000000..6b26ecd
--- /dev/null
+++ b/multiarchanalyze.sql
@@ -0,0 +1,199 @@
+CREATE INDEX IF NOT EXISTS depends_dependee_index ON depends(dependee);
+CREATE INDEX IF NOT EXISTS content_filename_index ON content(filename);
+
+BEGIN;
+
+/* Combinations of two packages and files with conflicting content. */
+DROP VIEW IF EXISTS contentconflict;
+CREATE VIEW contentconflict AS
+ SELECT p1.id AS pid1,
+ p2.id AS pid2,
+ p1.name AS name1,
+ p2.name AS name2,
+ p1.version AS version1,
+ p2.version AS version2,
+ p1.architecture AS architecture1,
+ p2.architecture AS architecture2,
+ p1.source AS source1,
+ p2.source AS source2,
+ p1.hasscripts AS hasscripts1,
+ p2.hasscripts AS hasscripts2,
+ p1.multiarch AS multiarch1,
+ p2.multiarch AS multiarch2,
+ c1.filename,
+ c1.id AS cid1,
+ c2.id AS cid2,
+ c1.hash AS hash1,
+ c2.hash AS hash2
+ FROM package AS p1, package AS p2, content AS c1, content AS c2
+ WHERE p1.id = c1.pid
+ AND p2.id = c2.pid
+ AND c1.filename = c2.filename
+ AND c1.hash != c2.hash;
+
+/* Candidate (depender, dependee) package pairs for architecture-enforcing
+ * dependencies, taking Provides into account. */
+DROP VIEW IF EXISTS archdepcandidate;
+CREATE VIEW archdepcandidate AS
+ SELECT p1.id AS dependerid, p2.id AS dependeeid
+ FROM package AS p1, depends AS d, package AS p2
+ WHERE p1.id = d.pid
+ AND d.dependee = p2.name
+ AND p2.multiarch IS NOT 'foreign'
+ AND (p2.multiarch IS NOT 'allowed' OR d.archqual IS NOT 'any')
+ AND (p1.architecture = p2.architecture
+ OR p1.architecture = 'all'
+ OR p2.architecture = 'all'
+ OR p2.architecture = d.archqual)
+ UNION
+ SELECT p1.id AS dependerid, p2.id AS dependeeid
+ FROM package AS p1, depends AS d, provides AS r, package AS p2
+ WHERE p1.id = d.pid
+ AND d.dependee = r.provided
+ AND r.pid = p2.id
+ AND p2.multiarch IS NOT 'foreign'
+ AND (p2.multiarch IS NOT 'allowed' OR d.archqual IS NOT 'any')
+ AND (p1.architecture = p2.architecture
+ OR p1.architecture = 'all'
+ OR p2.architecture = 'all'
+ OR p2.architecture = d.archqual);
+
+/* Describe properties of binary package names. */
+DROP VIEW IF EXISTS packageapi;
+CREATE VIEW packageapi AS
+ SELECT name,
+ count(*) AS instances,
+ count(CASE architecture WHEN 'all' THEN 1 ELSE NULL END)
+ AS hasindep,
+ CASE count(CASE architecture WHEN 'all' THEN NULL ELSE 1 END)
+ WHEN 0 THEN 0 ELSE 1 END
+ AS hasarchdep,
+ CASE min(source) WHEN max(source) THEN source ELSE NULL END
+ AS source,
+ max(hasscripts) AS hasscripts,
+ CASE min(multiarch)
+ WHEN max(multiarch) THEN multiarch
+ ELSE NULL END
+ AS multiarch
+ FROM package GROUP BY name;
+
+/* Names of architecture-dependent packages, i.e. names with at least three
+ * instances, none of which is Architecture: all. */
+DROP VIEW IF EXISTS archpackage;
+CREATE VIEW archpackage AS
+ SELECT
+ p1.name,
+ p1.hasscripts,
+ p1.multiarch,
+ exists(
+ SELECT 1 FROM package AS p2, archdepcandidate AS c
+ WHERE p2.name = p1.name AND c.dependerid = p2.id)
+ AS hasarchdeps
+ FROM packageapi AS p1
+ WHERE p1.instances >= 3 AND p1.hasindep = 0;
+
+/* Architecture-independent packages. */
+DROP VIEW IF EXISTS indeppackage;
+CREATE VIEW indeppackage AS
+ SELECT
+ id,
+ name,
+ version,
+ source,
+ hasscripts,
+ multiarch,
+ exists(SELECT 1 FROM archdepcandidate WHERE dependerid = id)
+ AS hasarchdeps
+ FROM package
+ WHERE architecture = 'all';
+
+/* Packages violating M-A:same by shipping conflicting files. */
+DROP VIEW IF EXISTS masame_conflict;
+CREATE VIEW masame_conflict AS
+ SELECT name1, architecture1, architecture2, filename
+ FROM contentconflict
+ WHERE name1 = name2
+ AND version1 = version2
+ AND multiarch1 = 'same'
+ AND multiarch2 = 'same';
+
+/* Packages that could be marked M-A:foreign, because they
+ * * are Architecture: all
+ * * do not have maintainer scripts
+ * * do not have architecture enforcing dependencies
+ */
+DROP VIEW IF EXISTS maforeign_candidate;
+CREATE VIEW maforeign_candidate AS
+ SELECT p1.name FROM indeppackage AS p1
+ WHERE p1.multiarch = 'no'
+ AND p1.hasscripts = 0
+ AND p1.hasarchdeps = 0;
+
+/* Packages that could be converted from Arch:any to Arch:all M-A:foreign,
+ * because they
+ * * do not have maintainer scripts
+ * * do not ship conflicting files
+ * * do not ship different file sets
+ * * do not have architecture enforcing dependencies
+ */
+DROP VIEW IF EXISTS archall_candidate;
+CREATE VIEW archall_candidate AS
+ SELECT p1.name FROM archpackage AS p1
+ WHERE p1.hasscripts = 0
+ AND NOT EXISTS (
+ SELECT 1 FROM contentconflict
+ WHERE name1 = p1.name AND name2 = p1.name)
+ AND NOT EXISTS (
+ SELECT 1
+ FROM package AS p2, content AS c2, package AS p3
+ WHERE p2.name = p1.name
+ AND c2.pid = p2.id
+ AND p3.name = p1.name
+ AND NOT EXISTS (
+ SELECT 1 FROM content AS c5
+ WHERE c5.pid = p3.id
+ AND c5.filename = c2.filename))
+ AND p1.hasarchdeps = 0;
+
+/* Packages that could be marked Multi-Arch: same, because they
+ * * are architecture dependent
+ * * do not have a Multi-Arch marking
+ * * do not have maintainer scripts
+ * * do not have file conflicts
+ */
+DROP VIEW IF EXISTS masame_candidate;
+CREATE VIEW masame_candidate AS
+ SELECT p1.name FROM archpackage AS p1
+ WHERE p1.multiarch = 'no'
+ AND p1.hasscripts = 0
+ AND NOT EXISTS (
+ SELECT 1 FROM contentconflict
+ WHERE name1 = p1.name AND name2 = p1.name);
+
+/* Package 'depender' has a dependency on 'dependee' that can be annotated with
+ * :any, because
+ * * 'depender' is architecture independent
+ * * 'depender' does not have maintainer scripts
+ * * the dependency on 'dependee' is the only architecture enforcing dependency
+ * * 'dependee' is Multi-Arch: allowed
+ */
+DROP VIEW IF EXISTS colonany_candidate;
+CREATE VIEW colonany_candidate AS
+ SELECT depender, dependee
+ FROM (
+ SELECT
+ p1.name AS depender,
+ p2.name AS dependee,
+ CASE max(p2.multiarch)
+ WHEN min(p2.multiarch) THEN p2.multiarch
+ ELSE NULL END
+ AS multiarch
+ FROM indeppackage AS p1, archdepcandidate AS c, package AS p2
+ WHERE p1.hasscripts = 0
+ AND c.dependerid = p1.id
+ AND c.dependeeid = p2.id
+ GROUP BY p1.id, p2.name)
+ GROUP BY depender
+ HAVING count(dependee) = 1
+ AND min(CASE multiarch WHEN 'allowed' THEN 1 ELSE 0 END) = 1;
+
+COMMIT;
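+
+/* The candidate views can also be queried interactively, for example (an
+ * illustrative query, the same one multiarchanalyze.py issues):
+ *   SELECT name FROM maforeign_candidate ORDER BY name;
+ */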
diff --git a/multiarchimport.py b/multiarchimport.py
new file mode 100755
index 0000000..37fa8f5
--- /dev/null
+++ b/multiarchimport.py
@@ -0,0 +1,242 @@
+#!/usr/bin/python
+
+import argparse
+from contextlib import closing
+import hashlib
+import logging
+import multiprocessing
+try:
+    import Queue
+except ImportError:  # the module was renamed to queue in Python 3
+    import queue as Queue
+import sqlite3
+try:
+ from urllib.request import urlopen
+except ImportError:
+ from urllib2 import urlopen
+
+from debian import deb822
+from debian.debian_support import version_compare
+
+from dedup.debpkg import DebExtractor, decodetarname
+from dedup.hashing import hash_file
+from dedup.utils import fetchiter, open_compressed_mirror_url
+
+def release_architectures(mirror):
+    with closing(urlopen("%s/dists/testing/Release" % mirror)) as f:
+        return set(deb822.Release(f)["Architectures"].split())
+
+def gather_pkgmeta(pkg):
+    meta = dict(depends=set(), provides=set())
+    for key in ("version", "filename"):
+        meta[key] = pkg[key]
+    try:
+        meta["multiarch"] = pkg["multi-arch"]
+    except KeyError:
+        pass
+    try:
+        meta["source"] = pkg["source"].split(None, 1)[0]
+    except KeyError:
+        pass
+    for kind, relations in pkg.relations.items():
+        if kind == "pre-depends":
+            kind = "depends"
+        elif kind not in ("provides", "depends"):
+            continue
+        for relation in relations:
+            if kind == "provides" and len(relation) != 1:
+                raise ValueError("provides with alternatives")
+            for alternative in relation:
+                name = alternative["name"]
+                # deb822 currently returns :any dependencies raw. see #670679
+                archqual = alternative.get("archqual")
+                if ':' in name:
+                    name, archqual = name.split(None, 1)[0].split(u':')
+                if kind == "provides":
+                    if archqual:
+                        raise ValueError(
+                            "arch qualification not supported on provides")
+                    meta[kind].add(name)
+                else:
+                    meta[kind].add((name, archqual))
+    return meta
+
+def process_list(mirror, architecture):
+    pkgs = dict()
+    url = "%s/dists/sid/main/binary-%s/Packages" % (mirror, architecture)
+    with closing(open_compressed_mirror_url(url)) as pkglist:
+        for pkg in deb822.Packages.iter_paragraphs(pkglist):
+            if pkg["architecture"] != architecture:
+                continue # e.g. Arch: all
+            try:
+                otherver = pkgs[pkg["package"]]["version"]
+            except KeyError:
+                pass
+            else:
+                if version_compare(otherver, pkg["version"]) > 0:
+                    continue # the version already recorded is newer, keep it
+            pkgs[pkg["package"]] = gather_pkgmeta(pkg)
+    return pkgs
+
+def process_one_list(item):
+    mirror, arch = item
+    return (arch, process_list(mirror, arch))
+
+class ProcessingFinished(Exception):
+    """Raised by MultiarchExtractor to stop extraction early once all
+    relevant data has been seen."""
+
+class MultiarchExtractor(DebExtractor):
+    """Gather multiarch-relevant facts about a binary package: whether it
+    ships maintainer scripts and, for architecture-dependent packages, the
+    hashes of its file contents."""
+
+    def __init__(self):
+        DebExtractor.__init__(self)
+        self.architecture = None
+        self.result = dict(hasscripts=False)
+
+    def handle_control_info(self, info):
+        self.architecture = info["architecture"]
+
+    def handle_control_member(self, name, content):
+        if name in ("preinst", "postinst", "prerm", "postrm"):
+            self.result["hasscripts"] = True
+
+    def handle_control_end(self):
+        if self.architecture == "all":
+            raise ProcessingFinished
+
+    def handle_data_tar(self, tarfileobj):
+        self.result["contents"] = contents = dict()
+        for elem in tarfileobj:
+            try:
+                name = decodetarname(elem.name)
+            except UnicodeDecodeError:
+                logging.warning("skipping filename with encoding error %r",
+                                elem.name)
+                continue # skip files with non-utf8 encoding for now
+            if elem.isreg():
+                hasher = hashlib.sha256()
+                hash_file(hasher, tarfileobj.extractfile(elem))
+                contents[name] = "R" + hasher.hexdigest()
+            elif elem.issym() or elem.islnk():
+                try:
+                    linkname = decodetarname(elem.linkname)
+                except UnicodeDecodeError:
+                    logging.warning("skipping link with encoding error %r",
+                                    elem.linkname)
+                    continue
+                if elem.issym():
+                    contents[name] = u"S" + linkname
+                else:
+                    try:
+                        contents[name] = contents[linkname]
+                    except KeyError:
+                        logging.warning("hard link to non-existent file %r",
+                                        linkname)
+        raise ProcessingFinished
+
+def process_one_package(item):
+    pid, url = item
+    extractor = MultiarchExtractor()
+    with closing(urlopen(url)) as pkgfile:
+        try:
+            extractor.process(pkgfile)
+        except ProcessingFinished:
+            pass
+    return (pid, extractor.result, url)
+
+def consume_items(dct):
+    """Yield and remove key-value pairs from the dictionary until it is empty."""
+    while True:
+        try:
+            yield dct.popitem()
+        except KeyError:
+            break
+
+def bounded_imap_unordered(bound, pool, function, iterable):
+    """Variant of multiprocessing.Pool.imap_unordered that keeps at most
+    bound tasks submitted but not yet consumed at any point in time."""
+    iterable = iter(iterable)
+    results = Queue.Queue()
+    outstanding = 0
+    while iterable or outstanding:
+        if iterable:
+            for elem in iterable:
+                pool.apply_async(function, (elem,), callback=results.put)
+                outstanding += 1
+                if outstanding >= bound or not results.empty():
+                    break
+            else:
+                iterable = None
+        if outstanding:
+            yield results.get()
+            outstanding -= 1
+
+def insert_package(cur, name, arch, meta):
+    logging.debug("adding %s %s", name, meta["version"])
+    cur.execute("INSERT INTO package (name, version, architecture, source, multiarch) VALUES (?, ?, ?, ?, ?);",
+                (name, meta["version"], arch, meta.get("source", name),
+                 meta.get("multiarch", "no")))
+    pid = cur.lastrowid
+    logging.debug("added %s as %d", name, pid)
+    if meta["depends"]:
+        logging.debug("inserting %d dependencies for %s",
+                      len(meta["depends"]), name)
+        cur.executemany("INSERT INTO depends (pid, dependee, archqual) VALUES (?, ?, ?);",
+                        ((pid, dependee, archqual)
+                         for dependee, archqual in meta["depends"]))
+    if meta["provides"]:
+        logging.debug("inserting %d provides for %s",
+                      len(meta["provides"]), name)
+        cur.executemany("INSERT INTO provides (pid, provided) VALUES (?, ?);",
+                        ((pid, provided) for provided in meta["provides"]))
+    return pid
+
+def main():
+    logging.basicConfig(level=logging.DEBUG)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-d", "--database", action="store",
+                        default="test.sqlite3",
+                        help="path to the sqlite3 database file")
+    parser.add_argument("-m", "--mirror", action="store",
+                        default="http://httpredir.debian.org/debian",
+                        help="Debian mirror to use")
+    args = parser.parse_args()
+    architectures = release_architectures(args.mirror)
+    architectures.add("all")
+    workers = multiprocessing.cpu_count()
+    pool = multiprocessing.Pool(workers)
+    db = sqlite3.connect(args.database)
+    cur = db.cursor()
+    cur.execute("PRAGMA foreign_keys = ON;")
+    todo = {}
+    for arch, pkgs in bounded_imap_unordered(3, pool, process_one_list,
+                                             ((args.mirror, arch)
+                                              for arch in architectures)):
+        logging.info("found %d Packages for architecture %s", len(pkgs), arch)
+        cur.execute("SELECT name, id, version FROM package WHERE architecture = ?;",
+                    (arch,))
+        remove = set()
+        for name, pid, version in fetchiter(cur):
+            if name in pkgs and version == pkgs[name]["version"]:
+                del pkgs[name]
+            else:
+                remove.add(pid)
+        logging.info("%d Packages are new", len(pkgs))
+        logging.info("removing %d old packages", len(remove))
+        cur.executemany("DELETE FROM package WHERE id = ?;",
+                        ((pid,) for pid in remove))
+        del remove
+
+        for name, meta in pkgs.items():
+            pid = insert_package(cur, name, arch, meta)
+            todo[pid] = meta["filename"]
+        del pkgs
+
+    logging.info("need to fetch %d packages", len(todo))
+    for pid, result, url in bounded_imap_unordered(2 * workers, pool,
+                                                   process_one_package,
+                                                   ((pid, "%s/%s" % (args.mirror, filename))
+                                                    for pid, filename in consume_items(todo))):
+        logging.debug("fetched %d from %s", pid, url)
+        cur.execute("UPDATE package SET hasscripts = ? WHERE id = ?;",
+                    (result["hasscripts"], pid))
+        if "contents" in result:
+            cur.executemany("INSERT INTO content (pid, filename, hash) VALUES (?, ?, ?);",
+                            ((pid, key, value)
+                             for key, value in result["contents"].items()))
+        db.commit()
+
+if __name__ == "__main__":
+    main()
diff --git a/multiarchschema.sql b/multiarchschema.sql
new file mode 100644
index 0000000..d3dc2eb
--- /dev/null
+++ b/multiarchschema.sql
@@ -0,0 +1,31 @@
+CREATE TABLE package (
+ id INTEGER PRIMARY KEY,
+ name TEXT NOT NULL,
+ version TEXT NOT NULL,
+ architecture TEXT NOT NULL,
+ source TEXT NOT NULL,
+ hasscripts BOOL,
+ multiarch TEXT
+ CHECK (multiarch IN ('allowed', 'foreign', 'no', 'same')),
+ UNIQUE (name, architecture));
+
+CREATE TABLE depends (
+ id INTEGER PRIMARY KEY,
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ dependee TEXT NOT NULL,
+ archqual TEXT);
+
+CREATE TABLE provides (
+ id INTEGER PRIMARY KEY,
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ provided TEXT NOT NULL);
+
+CREATE TABLE content (
+ id INTEGER PRIMARY KEY,
+ pid INTEGER NOT NULL REFERENCES package(id) ON DELETE CASCADE,
+ filename TEXT NOT NULL,
+ hash TEXT NOT NULL);
+
+CREATE INDEX depends_pid_index ON depends(pid);
+CREATE INDEX provides_pid_index ON provides(pid);
+CREATE INDEX content_pid_index ON content(pid);