summaryrefslogtreecommitdiff
path: root/linuxnamespaces/__init__.py
diff options
context:
space:
mode:
Diffstat (limited to 'linuxnamespaces/__init__.py')
-rw-r--r--linuxnamespaces/__init__.py395
1 files changed, 137 insertions, 258 deletions
diff --git a/linuxnamespaces/__init__.py b/linuxnamespaces/__init__.py
index 8c1def3..83358b6 100644
--- a/linuxnamespaces/__init__.py
+++ b/linuxnamespaces/__init__.py
@@ -6,253 +6,61 @@ Python.
"""
import asyncio
-import bisect
import contextlib
-import dataclasses
import errno
import fcntl
+import logging
import os
import pathlib
import socket
import stat
import struct
-import subprocess
import typing
from .filedescriptor import *
+from .idmap import *
from .atlocation import *
from .syscalls import *
-def subidranges(
- kind: typing.Literal["uid", "gid"], login: str | None = None
-) -> typing.Iterator[tuple[int, int]]:
- """Parse a `/etc/sub?id` file for ranges allocated to the given or current
- user. Return all ranges as (start, count) pairs.
- """
- if login is None:
- login = os.getlogin()
- with open(f"/etc/sub{kind}") as filelike:
- for line in filelike:
- parts = line.strip().split(":")
- if parts[0] == login:
- yield (int(parts[1]), int(parts[2]))
-
-
-@dataclasses.dataclass(frozen=True)
-class IDMapping:
- """Represent one range in a user or group id mapping."""
-
- innerstart: int
- outerstart: int
- count: int
-
- def __post_init__(self) -> None:
- if self.outerstart < 0:
- raise ValueError("outerstart must not be negative")
- if self.innerstart < 0:
- raise ValueError("innerstart must not be negative")
- if self.count <= 0:
- raise ValueError("count must be positive")
- if self.outerstart + self.count >= 1 << 64:
- raise ValueError("outerstart + count exceed 64bits")
- if self.innerstart + self.count >= 1 << 64:
- raise ValueError("innerstart + count exceed 64bits")
-
-
-class IDAllocation:
- """This represents a subset of IDs (user or group). It can be used to
- allocate a contiguous range for use with a user namespace.
- """
-
- def __init__(self) -> None:
- self.ranges: list[tuple[int, int]] = []
-
- def add_range(self, start: int, count: int) -> None:
- """Add count ids starting from start to this allocation."""
- if start < 0 or count <= 0:
- raise ValueError("invalid range")
- index = bisect.bisect_right(self.ranges, (start, 0))
- prevrange = None
- if index > 0:
- prevrange = self.ranges[index - 1]
- if prevrange[0] + prevrange[1] > start:
- raise ValueError("attempt to add overlapping range")
- nextrange = None
- if index < len(self.ranges):
- nextrange = self.ranges[index]
- if nextrange[0] < start + count:
- raise ValueError("attempt to add overlapping range")
- if prevrange and prevrange[0] + prevrange[1] == start:
- if nextrange and nextrange[0] == start + count:
- self.ranges[index - 1] = (
- prevrange[0],
- prevrange[1] + count + nextrange[1],
- )
- del self.ranges[index]
- else:
- self.ranges[index - 1] = (prevrange[0], prevrange[1] + count)
- elif nextrange and nextrange[0] == start + count:
- self.ranges[index] = (start, count + nextrange[1])
- else:
- self.ranges.insert(index, (start, count))
-
- @classmethod
- def loadsubid(
- cls, kind: typing.Literal["uid", "gid"], login: str | None = None,
- ) -> "IDAllocation":
- """Load a `/etc/sub?id` file and return ids allocated to the given
- login or current user.
- """
- self = cls()
- for start, count in subidranges(kind, login):
- self.add_range(start, count)
- return self
-
- def find(self, count: int) -> int:
- """Locate count contiguous ids from this allocation. The start of
- the allocation is returned. The allocation object is left unchanged.
- """
- for start, available in self.ranges:
- if available >= count:
- return start
- raise ValueError("could not satisfy allocation request")
-
- def allocate(self, count: int) -> int:
- """Allocate count contiguous ids from this allocation. The start of
- the allocation is returned and the ids are removed from this
- IDAllocation object.
- """
- for index, (start, available) in enumerate(self.ranges):
- if available > count:
- self.ranges[index] = (start + count, available - count)
- return start
- if available == count:
- del self.ranges[index]
- return start
- raise ValueError("could not satisfy allocation request")
-
- def allocatemap(self, count: int, target: int = 0) -> IDMapping:
- """Allocate count contiguous ids from this allocation. An IDMapping
- with its innerstart set to target is returned. The allocation is
- removed from this IDAllocation object.
- """
- return IDMapping(target, self.allocate(count), count)
-
- def reserve(self, start: int, count: int) -> None:
- """Reserve (and remove) the given range from this allocation. If the
- range is not fully contained in this allocation, a ValueError is
- raised.
- """
- if count < 0:
- raise ValueError("negative count")
- index = bisect.bisect_right(self.ranges, (start, float("inf"))) - 1
- if index < 0:
- raise ValueError("range to reserve not found")
- cur_start, cur_count = self.ranges[index]
- assert cur_start <= start
- if cur_start == start:
- # Requested range starts at range boundary
- if cur_count < count:
- raise ValueError("range to reserve not found")
- if cur_count == count:
- # Requested range matches a range exactly
- del self.ranges[index]
- else:
- # Requested range is a head of the matched range
- self.ranges[index] = (start + count, cur_count - count)
- elif cur_start + cur_count >= start + count:
- # Requested range fits into a matched range
- self.ranges[index] = (cur_start, start - cur_start)
- if cur_start + cur_count > start + count:
- # Requested range punches a hole into a matched range
- self.ranges.insert(
- index + 1,
- (start + count, cur_start + cur_count - (start + count)),
- )
- # else: Requested range is a tail of a matched range
- else:
- raise ValueError("range to reserve not found")
-
-
-def newidmap(
- kind: typing.Literal["uid", "gid"],
- pid: int,
- mapping: list[IDMapping],
- helper: bool | None = None,
-) -> None:
- """Apply the given uid or gid mapping to the given process. A positive pid
- identifies a process, other values identify the currently running process.
- Whether setuid binaries newuidmap and newgidmap are used is determined via
- the helper argument. A None value indicate automatic detection of whether
- a helper is required for setting up the given mapping.
- """
-
- assert kind in ("uid", "gid")
- if pid <= 0:
- pid = os.getpid()
- if helper is None:
- # We cannot reliably test whether we have the right EUID and we don't
- # implement checking whether setgroups has been denied either. Please
- # be explicit about the helper choice in such cases.
- helper = len(mapping) > 1 or mapping[0].count > 1
- if helper:
- argv = [f"new{kind}map", str(pid)]
- for idblock in mapping:
- argv.extend(map(str, dataclasses.astuple(idblock)))
- subprocess.check_call(argv)
- else:
- pathlib.Path(f"/proc/{pid}/{kind}_map").write_text(
- "".join(
- "%d %d %d\n" % dataclasses.astuple(idblock)
- for idblock in mapping
- ),
- encoding="ascii",
- )
-
-
-def newuidmap(pid: int, mapping: list[IDMapping], helper: bool = True) -> None:
- """Apply a given uid mapping to the given process. Refer to newidmap for
- details.
- """
- newidmap("uid", pid, mapping, helper)
-
-
-def newgidmap(pid: int, mapping: list[IDMapping], helper: bool = True) -> None:
- """Apply a given gid mapping to the given process. Refer to newidmap for
- details.
- """
- newidmap("gid", pid, mapping, helper)
-
-
-def newidmaps(
- pid: int,
- uidmapping: list[IDMapping],
- gidmapping: list[IDMapping],
- helper: bool = True,
-) -> None:
- """Apply a given uid and gid mapping to the given process. Refer to
- newidmap for details.
- """
- newgidmap(pid, gidmapping, helper)
- newuidmap(pid, uidmapping, helper)
+_logger = logging.getLogger(__name__)
class run_in_fork:
"""Decorator for running the decorated function once in a separate process.
"""
- def __init__(self, function: typing.Callable[[], None]):
- """Fork a new process that will eventually run the given function and
- then exit.
+ def __init__(
+ self, function: typing.Callable[[], None], start: bool = False
+ ):
+ """Fork a new process that will run the given function and then exit.
+ If start is true, run it immediately, otherwise the start or __call__
+ method should be used.
"""
- self.efd = EventFD()
+ self.efd = None if start else EventFD()
self.pid = os.fork()
if self.pid == 0:
- self.efd.read()
- self.efd.close()
- function()
- os._exit(0)
+ code = 0
+ try:
+ if self.efd is not None:
+ self.efd.read()
+ self.efd.close()
+ self.efd = None
+ function()
+ except SystemExit as err:
+ code = err.code
+ except:
+ _logger.exception(
+ "uncaught exception in run_in_fork %r", function
+ )
+ code = 1
+ os._exit(code)
+
+ @classmethod
+ def now(cls, function: typing.Callable[[], None]) -> typing.Self:
+ """Fork a new process that will immediately run the given function and
+ then exit."""
+ return cls(function, start=True)
def start(self) -> None:
"""Start the decorated function. It can only be started once."""
@@ -260,6 +68,7 @@ class run_in_fork:
raise ValueError("this function can only be called once")
self.efd.write(1)
self.efd.close()
+ self.efd = None
def wait(self) -> None:
"""Wait for the process running the decorated function to finish."""
@@ -270,8 +79,11 @@ class run_in_fork:
raise ValueError("something failed")
def __call__(self) -> None:
- """Start the decorated function and wait for its process to finish."""
- self.start()
+ """Start the decorated function if needed and wait for its process to
+ finish.
+ """
+ if self.efd:
+ self.start()
self.wait()
@@ -282,9 +94,12 @@ class async_run_in_fork:
synchronous and it must not access the event loop of the main process.
"""
- def __init__(self, function: typing.Callable[[], None]):
- """Fork a new process that will eventually run the given function and
- then exit.
+ def __init__(
+ self, function: typing.Callable[[], None], start: bool = False
+ ):
+ """Fork a new process that will run the given function and then exit.
+ If start is true, run it immediately, otherwise the start or __call__
+ method should be used.
"""
loop = asyncio.get_running_loop()
with asyncio.get_child_watcher() as watcher:
@@ -293,15 +108,33 @@ class async_run_in_fork:
"active child watcher required for creating a process"
)
self.future = loop.create_future()
- self.efd = EventFD()
+ self.efd = None if start else EventFD()
self.pid = os.fork()
if self.pid == 0:
- self.efd.read()
- self.efd.close()
- function()
- os._exit(0)
+ code = 0
+ try:
+ if self.efd:
+ self.efd.read()
+ self.efd.close()
+ self.efd = None
+ asyncio.set_event_loop(None)
+ function()
+ except SystemExit as err:
+ code = err.code
+ except:
+ _logger.exception(
+ "uncaught exception in run_in_fork %r", function
+ )
+ code = 1
+ os._exit(code)
watcher.add_child_handler(self.pid, self._child_callback)
+ @classmethod
+ def now(cls, function: typing.Callable[[], None]) -> typing.Self:
+ """Fork a new process that will immediately run the given function and
+ then exit."""
+ return cls(function, start=True)
+
def _child_callback(self, pid: int, returncode: int) -> None:
if self.pid != pid:
return
@@ -313,6 +146,7 @@ class async_run_in_fork:
raise ValueError("this function can only be called once")
self.efd.write(1)
self.efd.close()
+ self.efd = None
async def wait(self) -> None:
"""Wait for the process running the decorated function to finish."""
@@ -323,8 +157,11 @@ class async_run_in_fork:
raise ValueError("something failed")
async def __call__(self) -> None:
- """Start the decorated function and wait for its process to finish."""
- self.start()
+ """Start the decorated function if needed and wait for its process to
+ finish.
+ """
+ if self.efd:
+ self.start()
await self.wait()
@@ -348,7 +185,7 @@ def bind_mount(
srcloc = os.fspath(source)
tgtloc = os.fspath(target)
except ValueError:
- otflags = OpenTreeFlags.OPEN_TREE_CLONE
+ otflags = OpenTreeFlags.CLONE
if recursive:
otflags |= OpenTreeFlags.AT_RECURSIVE
with open_tree(source, otflags) as srcfd:
@@ -362,6 +199,17 @@ def bind_mount(
mount(srcloc, tgtloc, None, mflags)
+def get_cgroup(pid: int = -1) -> pathlib.PurePath:
+ """Look up the cgroup that the given pid or the current process belongs
+ to.
+ """
+ return pathlib.PurePath(
+ pathlib.Path(
+ f"/proc/{pid}/cgroup" if pid > 0 else "/proc/self/cgroup"
+ ).read_text().split(":", 2)[2].strip()
+ )
+
+
_P = typing.ParamSpec("_P")
class _ExceptionExitCallback:
@@ -397,7 +245,7 @@ def populate_dev(
newroot: PathConvertible,
*,
fuse: bool = True,
- pidns: bool = True,
+ pts: typing.Literal["defer", "host", "new", "absent"] = "new",
tun: bool = True,
) -> None:
"""Mount a tmpfs to the dev directory beneath newroot and populate it with
@@ -407,6 +255,12 @@ def populate_dev(
Even though a CAP_SYS_ADMIN-enabled process can umount components of the
/dev hierarchy, they cannot gain privileges in doing so as no
hierarchies are restricted via tmpfs mounts or read-only bind mounts.
+
+ The /dev/fuse and /dev/net/tun devices are optional and can be enabled or
+ disabled as desired. /dev/pts (and /dev/ptmx) can be shared with the host
+ or mounted as a new instance. Since a PID namespace is usually required for
+ mounting a new instance, it can also be deferred to a later manual mount.
+ If not desired, it can be left absent.
"""
origdev = AtLocation(origroot) / "dev"
newdev = AtLocation(newroot) / "dev"
@@ -423,31 +277,31 @@ def populate_dev(
for fn in "null zero full random urandom tty".split():
files.add(fn)
bind_mounts[fn] = exitstack.enter_context(
- open_tree(origdev / fn, OpenTreeFlags.OPEN_TREE_CLONE)
+ open_tree(origdev / fn, OpenTreeFlags.CLONE)
)
if fuse:
files.add("fuse")
bind_mounts["fuse"] = exitstack.enter_context(
- open_tree(origdev / "fuse", OpenTreeFlags.OPEN_TREE_CLONE)
+ open_tree(origdev / "fuse", OpenTreeFlags.CLONE)
)
- if pidns:
- symlinks["ptmx"] = "pts/ptmx"
- else:
+ if pts == "host":
bind_mounts["pts"] = exitstack.enter_context(
open_tree(
origdev / "pts",
- OpenTreeFlags.AT_RECURSIVE | OpenTreeFlags.OPEN_TREE_CLONE,
+ OpenTreeFlags.AT_RECURSIVE | OpenTreeFlags.CLONE,
)
)
files.add("ptmx")
bind_mounts["ptmx"] = exitstack.enter_context(
- open_tree(origdev / "ptmx", OpenTreeFlags.OPEN_TREE_CLONE)
+ open_tree(origdev / "ptmx", OpenTreeFlags.CLONE)
)
+ elif pts != "absent":
+ symlinks["ptmx"] = "pts/ptmx"
if tun:
directories.add("net")
files.add("net/tun")
bind_mounts["net/tun"] = exitstack.enter_context(
- open_tree(origdev / "net/tun", OpenTreeFlags.OPEN_TREE_CLONE)
+ open_tree(origdev / "net/tun", OpenTreeFlags.CLONE)
)
mount(
"devtmpfs",
@@ -472,7 +326,7 @@ def populate_dev(
(newdev / fn).mknod(stat.S_IFREG)
for fn, target in symlinks.items():
(newdev / fn).symlink_to(target)
- if pidns:
+ if pts == "new":
mount(
"devpts",
newdev / "pts",
@@ -514,7 +368,7 @@ def populate_proc(
if namespaces & CloneFlags.NEWNET == CloneFlags.NEWNET:
psn = open_tree(
newproc / "sys/net",
- OpenTreeFlags.OPEN_TREE_CLONE | OpenTreeFlags.AT_RECURSIVE,
+ OpenTreeFlags.CLONE | OpenTreeFlags.AT_RECURSIVE,
)
bind_mount(newproc / "sys", newproc / "sys", True, True)
if psn is not None:
@@ -570,7 +424,7 @@ def populate_sys(
bindfd = exitstack.enter_context(
open_tree(
AtLocation(origroot) / "sys" / source,
- OpenTreeFlags.OPEN_TREE_CLONE | OpenTreeFlags.AT_RECURSIVE,
+ OpenTreeFlags.CLONE | OpenTreeFlags.AT_RECURSIVE,
),
)
if rdonly:
@@ -612,8 +466,13 @@ def unshare_user_idmap(
unshare(flags)
setup_idmaps()
+
def unshare_user_idmap_nohelper(
- uid: int, gid: int, flags: CloneFlags = CloneFlags.NEWUSER
+ uid: int,
+ gid: int,
+ flags: CloneFlags = CloneFlags.NEWUSER,
+ *,
+ proc: AtLocationLike | None = None,
) -> None:
"""Unshare the given namespaces (must include user) and
map the current user and group to the given uid and gid
@@ -622,14 +481,20 @@ def unshare_user_idmap_nohelper(
uidmap = IDMapping(uid, os.getuid(), 1)
gidmap = IDMapping(gid, os.getgid(), 1)
unshare(flags)
- pathlib.Path("/proc/self/setgroups").write_bytes(b"deny")
- newidmaps(-1, [uidmap], [gidmap], False)
+ proc = AtLocation("/proc" if proc is None else proc)
+ (proc / "self/setgroups").write_bytes(b"deny")
+ newidmaps(-1, [uidmap], [gidmap], False, proc=proc)
class _AsyncFilesender:
bs = 65536
- def __init__(self, from_fd: int, to_fd: int, count: int | None = None):
+ def __init__(
+ self,
+ from_fd: FileDescriptor,
+ to_fd: FileDescriptor,
+ count: int | None = None,
+ ):
self.from_fd = from_fd
self.to_fd = to_fd
self.copied = 0
@@ -662,7 +527,12 @@ class _AsyncFilesender:
class _AsyncSplicer:
bs = 65536
- def __init__(self, from_fd: int, to_fd: int, count: int | None = None):
+ def __init__(
+ self,
+ from_fd: FileDescriptor,
+ to_fd: FileDescriptor,
+ count: int | None = None,
+ ):
self.from_fd = from_fd
self.to_fd = to_fd
self.copied = 0
@@ -706,7 +576,12 @@ class _AsyncSplicer:
class _AsyncCopier:
bs = 65536
- def __init__(self, from_fd: int, to_fd: int, count: int | None = None):
+ def __init__(
+ self,
+ from_fd: FileDescriptor,
+ to_fd: FileDescriptor,
+ count: int | None = None,
+ ):
self.from_fd = from_fd
self.to_fd = to_fd
self.buffer = b""
@@ -770,13 +645,17 @@ class _AsyncCopier:
def async_copyfd(
- from_fd: int, to_fd: int, count: int | None = None
+ from_fd: FileDescriptorLike,
+ to_fd: FileDescriptorLike,
+ count: int | None = None,
) -> asyncio.Future[int]:
"""Copy the given number of bytes from the first file descriptor to the
second file descriptor in an asyncio context. Both copies are performed
binary. An efficient implementation is chosen depending on the file type
of file descriptors.
"""
+ from_fd = FileDescriptor(from_fd)
+ to_fd = FileDescriptor(to_fd)
from_mode = os.fstat(from_fd).st_mode
if stat.S_ISREG(from_mode):
return _AsyncFilesender(from_fd, to_fd, count).fut
@@ -786,7 +665,7 @@ def async_copyfd(
class _AsyncPidfdWaiter:
- def __init__(self, pidfd: int, flags: int):
+ def __init__(self, pidfd: FileDescriptor, flags: int):
self.pidfd = pidfd
self.flags = flags
self.loop = asyncio.get_running_loop()
@@ -811,12 +690,12 @@ class _AsyncPidfdWaiter:
def async_waitpidfd(
- pidfd: int, flags: int
+ pidfd: FileDescriptorLike, flags: int
) -> asyncio.Future[os.waitid_result | None]:
"""Asynchronously wait for a process represented as a pidfd. This is an
async variant of waitid(P_PIDFD, pidfd, flags).
"""
- return _AsyncPidfdWaiter(pidfd, flags).fut
+ return _AsyncPidfdWaiter(FileDescriptor(pidfd), flags).fut
def enable_loopback_if() -> None: