Repository: arachsys/containers
Branch: master
Commit: 6c3f866286f6
Files: 13
Total size: 43.4 KB

Directory structure:
gitextract_ylyp0gun/

├── .gitignore
├── COPYING
├── Makefile
├── README
├── TIPS
├── console.c
├── contain.c
├── contain.h
├── inject.c
├── map.c
├── mount.c
├── pseudo.c
└── util.c

================================================
FILE CONTENTS
================================================

================================================
FILE: .gitignore
================================================
*.o
/contain
/inject
/pseudo
/tags


================================================
FILE: COPYING
================================================
Copyright (C) 2013 Chris Webb <chris@arachsys.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.


================================================
FILE: Makefile
================================================
# Installation directory (below DESTDIR) and compiler flags.
BINDIR := $(PREFIX)/bin
CFLAGS := -Os -Wall -Wfatal-errors

# BINARIES are installed with default permissions; SUIDROOT binaries are
# installed owned by root with mode 4755 (see the install target).
BINARIES := inject
SUIDROOT := contain pseudo

# Build each program in one step from all of its .c prerequisites. Headers
# and the Makefile itself are prerequisites too, so changing them triggers
# a rebuild, but only the .c files are passed to the compiler.
%:: %.c Makefile
	$(CC) $(CFLAGS) -o $@ $(filter %.c,$^)

all: $(BINARIES) $(SUIDROOT)

# Per-program source lists consumed by the pattern rule above.
contain: contain.[ch] console.c map.c mount.c util.c

inject: contain.h inject.c map.c util.c

pseudo: contain.h pseudo.c map.c util.c

clean:
	rm -f $(BINARIES) $(SUIDROOT)

# Binaries are stripped (-s) as they are installed.
install: $(BINARIES) $(SUIDROOT)
	mkdir -p $(DESTDIR)$(BINDIR)
	install -s $(BINARIES) $(DESTDIR)$(BINDIR)
	install -o root -g root -m 4755 -s $(SUIDROOT) $(DESTDIR)$(BINDIR)

.PHONY: all clean install


================================================
FILE: README
================================================
Containers
==========

This package is a simple implementation of containers for Linux, making
secure containers as easy to create and use as a traditional chroot. It
comprises three utilities, contain, inject and pseudo, which use the kernel
support for user namespaces merged in Linux 3.8.


Demonstration
-------------

With the utilities already installed, the demo begins in an unprivileged
user's shell:

  $ echo $$ $UID
  21260 1000

To create a simple test container, copy /bin and /lib* from the host into a
temporary directory with the default UID/GID mappings applied:

  $ cd $(mktemp -d)
  $ tar -c -f - -C / bin lib lib32 lib64 | pseudo tar -x -f -

It is very straightforward to launch a container with this newly-created
root filesystem:

  $ contain . /bin/bash
  #

The new shell has PID 1 within the container, and cannot see other processes
on the host:

  # echo $$ $UID
  1 0
  # ps ax
    PID TTY      STAT   TIME COMMAND
      1 console  Ss     0:00 /bin/bash
      2 console  R+     0:00 ps ax

The container root user is able to manipulate ownerships and permissions
within its filesystem:

  # ls -l /dev/console
  crw--w---- 1 0 5 136, 9 Jul  1 14:00 /dev/console
  # chown 12:34 /dev/console
  # chmod a+rw /dev/console
  # ls -l /dev/console
  crw-rw-rw- 1 12 34 136, 9 Jul  1 14:00 /dev/console

and can also make other privileged changes such as setting the hostname:

  # echo -n "hostname $(hostname) -> " && hostname brian && hostname
  hostname alice -> brian

or configuring the network stack:

  # ip link show
  1: lo: <LOOPBACK> mtu 65536 qdisc noop state DOWN mode DEFAULT
      link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
  # ping -w 1 1.2.3.4 &>/dev/null && echo up || echo down
  down
  # ip addr add 1.2.3.4/32 dev lo && ip link set lo up
  # ping -w 1 1.2.3.4 &>/dev/null && echo up || echo down
  up
  # ip link add type veth && ip link show
  1: lo: <LOOPBACK,UP,LOWER_UP> mtu 65536 qdisc noqueue state UNKNOWN mode DEFAULT
      link/loopback 00:00:00:00:00:00 brd 00:00:00:00:00:00
  2: veth0: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000
      link/ether 3a:0c:96:36:2d:ff brd ff:ff:ff:ff:ff:ff
  3: veth1: <BROADCAST,MULTICAST> mtu 1500 qdisc noop state DOWN mode DEFAULT qlen 1000
      link/ether a2:86:1a:92:58:cb brd ff:ff:ff:ff:ff:ff

In all cases, these changes affect the container but not the host as a
whole. Processes in the container live in different resource namespaces
isolated from the host, and the container root user is unable to do anything
that would require elevated capabilities or root privilege on the host
itself.


contain
-------

The contain utility is invoked as

  contain [OPTIONS] DIR [CMD [ARG]...]

with options

  -c        disable console emulation in the container
  -g MAP    set the container-to-host GID map
  -i CMD    run a helper child inside the new namespaces
  -n        share the host network unprivileged in the container
  -o CMD    run a helper child outside the new namespaces
  -u MAP    set the container-to-host UID map

and creates a new container with DIR recursively bound as its root
filesystem, running CMD as PID 1 within that container. If unspecified, CMD
defaults to /bin/sh to start a shell, so to fully boot a distribution,
specify CMD as /bin/init or /sbin/init.

The container init process is isolated in new user, cgroup, mount, IPC, UTS,
time and PID namespaces. A synthetic /dev with device nodes bound from the
host /dev is automatically mounted within the new mount namespace, together
with standard /dev/pts, /proc and /sys filesystems.

Because it runs in its own user namespace, users and groups seen inside a
container are not the same as the underlying credentials visible for the
same processes and files on the host. Sensible default container-to-host UID
and GID mappings are provided and described below, but the -u and -g options
can be used to override the defaults.

The container console is a host pseudo-terminal bound at /dev/console in the
new /dev filesystem: stdin and stdout are copied to/from this, and it serves
as stdin, stdout and stderr for the container init process. This console
emulation can be disabled using the -c option: if -c is used, init is run
directly with the stdin, stdout and stderr of the contain command.

Containers are usually isolated in their own network namespace, with a
distinct set of network interfaces from the host. By specifying the -n
option, it is possible to safely share the host network stack instead. If
you do this, user networking within the container will work normally, but
the container has no privileges with respect to its network namespace so it
isn't possible to (re)configure interfaces or routes, and setuid utilities
like ping which use a raw socket will fail.

Two different kinds of helper program can be used to help set up a
container. A program specified with -i is run inside the new namespaces with
the new root filesystem as its working directory, just before pivoting into
it. Typically this type of helper is used to bind-mount additional parts of
the host filesystem inside the container.

A helper specified with -o is run outside the namespaces but as a direct
child of the supervisor process which is running within them. This type of
helper can be used to move host network interfaces (such as a macvtap
interface or one half of a veth pair) into the container's network
namespace.

The environment of the container init process includes "container=contain"
so that distributions can identify when they are running under contain.


inject
------

The inject utility is invoked as

  inject PID [CMD [ARG]...]

where PID is the process ID of a running container supervisor, and runs a
command or shell inside the existing container. The environment, stdin,
stdout and stderr of inject are all inherited by the command to be run.

The container supervisor PID (i.e. that of contain itself) should be given
to inject, not the PID of the descendant init process. The inject utility
will only work if the process specified has a child with "container=contain"
in its environment, which it assumes to be the container init.

Linux allows an unprivileged user to join the user namespace of any process
he can dump or ptrace, so inject need not be installed setuid even if
contain and pseudo are setuid root. It will refuse to run if it detects
setuid/setgid operation.


pseudo
------

The pseudo utility is invoked as

  pseudo [OPTIONS] [CMD [ARG]...]

with options

  -g MAP    set the user namespace GID map
  -u MAP    set the user namespace UID map

and runs a command or shell as root in a new user namespace, by analogy with
sudo which runs a command as root in the host user namespace.

Unlike contain, pseudo does not unshare other namespaces or attempt to
isolate the new process from the rest of the host. It has identical default
UID/GID mappings, -u and -g options, and support for /etc/subuid and
/etc/subgid when installed setuid root, but no other contain options are
supported.

One use for pseudo is as a more capable replacement for fakeroot, useful for
testing, when building software packages or for constructing system images.
Unlike the traditional fakeroot approach based on LD_PRELOAD, static
binaries and chroot jails are both handled correctly.

It is also invaluable for running host software to access the same
filesystem as a container, replicating the user and group file ownerships
that the container would see. For example, in the demo above, the system
image is untarred under pseudo so that files are written into the filesystem
with UIDs and GIDs mapped for the container rather than unmapped as on the
host.


User and group mappings
-----------------------

By default, when run as root, contain and pseudo will map container UID/GID
0 onto the highest available host UID/GID (4294967294 unless nested), and
all other UIDs/GIDs are mapped onto themselves apart from the top container
UID and GID which must be left unmapped.

The default mappings avoid host UID and GID 0 as the host root user is still
granted a variety of privileges even after dropping all capabilities in the
host user namespace. For example, /proc and /sys files typically have (host)
root:root ownership, and allowing the container unfiltered access to
things like /proc/sys is dangerous.

Run as an unprivileged user, container UID/GID 0 is mapped onto the
unprivileged user's UID/GID, then container UIDs/GIDs 1, 2, etc. are
successively mapped onto any ranges delegated to that user in /etc/subuid
and /etc/subgid.

The -u and -g options can be used to specify custom mappings, in the format
START:LOWER:COUNT[,START:LOWER:COUNT]... where START is the first UID/GID in
a container range, LOWER is the first UID/GID in the corresponding range in
the host, and COUNT is the length of these ranges.

For example, -u 0:1000:1,1:4000:2000 will map container UID 0 onto host UID
1000 and container UIDs 1...2000 onto host UIDs 4000...5999.

It is not possible to map more than one container ID onto a given host ID,
nor to list the same container ID twice in a map specification. When invoked
by an unprivileged user, all host ranges are checked against /etc/subuid and
/etc/subgid.

Unmapped users and groups are mapped by the kernel onto the overflow UID and
GID set in /proc/sys/kernel/overflowuid and /proc/sys/kernel/overflowgid. By
default the kernel sets both these values to 65534.


Unprivileged operation, /etc/subuid and /etc/subgid
---------------------------------------------------

When a non-root user runs contain or pseudo unprivileged, these tools can
only map container UID/GIDs onto the host UID/GID of that user. The
resulting container is not very useful as it has just a single user and
group available. (Typically only root is mapped in the container.)

However, contain and pseudo can also be installed setuid root, and in this
case, unprivileged users can also map onto ranges of UIDs/GIDs that have
been delegated for their use in /etc/subuid and /etc/subgid.

The format of these files is similar to /etc/passwd, /etc/group and
/etc/shadow. Each line specifies an additional range of UIDs/GIDs allocated
to a particular user, and there can be zero, one, or multiple lines for any
given user. There are three colon-delimited fields: the user's login name,
the first UID/GID in the range, and the number of UIDs/GIDs in the range.
For example, an /etc/subuid containing the lines

  chris:100000:10000
  chris:120000:10000

allocates UID ranges 100000-109999 and 120000-129999 to my user 'chris' in
addition to my normal login UID.

The kernel user namespace author Eric Biederman <ebiederm@xmission.com> has
proposed patches against the standard GNU/Linux Shadow package which add
support for creating and updating these files in this format; they are
likely to become a standard way to delegate sub-users and sub-groups.

Linux 3.19 and later do not allow unprivileged processes to write a GID map
unless the setgroups() call has been permanently disabled by writing "deny"
to /proc/PID/setgroups. This is a fix for CVE-2014-8989 which applied to
strangely-configured systems where group membership implies more restricted
permissions rather than supplementary permissions.

As a result, when run non-setuid by an unprivileged user, contain and pseudo
must disable setgroups() in the container. Conversely, when installed setuid
root, they will use their privilege to bypass this kernel restriction,
resulting in fully-functional containers which still support setgroups().
However, this also means that they can be used to bypass restrictions
implemented by group membership.


Building and installing
-----------------------

Unpack the source tar.gz file and change to the unpacked directory.

Run 'make', then 'make install' as root to install contain and pseudo setuid
root, and inject without setuid, in /bin. Alternatively, you can set DESTDIR
and/or BINDIR to install in a different location, or strip and copy the
compiled binaries into the correct place manually.

Note that setuid contain and pseudo effectively enable unprivileged users to
drop supplementary group memberships using setgroups(). Consequently,
they should NOT be installed setuid root on systems where group membership
implies more restricted permissions rather than supplementary permissions.

These utilities were developed on GNU/Linux and are not portable to other
platforms as they rely on Linux-specific facilities such as namespaces.
Please report any problems or bugs to Chris Webb <chris@arachsys.com>.


Copying
-------

This software was written by Chris Webb <chris@arachsys.com> and is
distributed as Free Software under the terms of the MIT license in COPYING.


================================================
FILE: TIPS
================================================
Shutting down or killing a container
------------------------------------

From the host, the inject utility can be used to run an appropriate command
within the container to start a graceful shut down. For example

  inject PID /bin/halt

To immediately kill a container and all its processes, it is sufficient to
send the init process a SIGKILL from the host using

  pkill -KILL -P PID

where PID is the process ID of a running container supervisor. It is very
important not to SIGKILL the container supervisor itself or the container
will be orphaned, continuing to run unsupervised as a child of the host
init.


Using cgroups to limit memory and processes available to a container
--------------------------------------------------------------------

If cgroup support, the memory controller and the pids controller are
compiled into the kernel, a mounted cgroup2 filesystem can be used to apply
memory and process-count limits to a container as it is started. For
example, the shell script

  #!/bin/sh -e
  echo +memory +pids >/sys/fs/cgroup/cgroup.subtree_control
  mkdir /sys/fs/cgroup/mycontainer
  echo $$ >/sys/fs/cgroup/mycontainer/cgroup.procs
  echo 2G >/sys/fs/cgroup/mycontainer/memory.high
  echo 3G >/sys/fs/cgroup/mycontainer/memory.max
  echo 2G >/sys/fs/cgroup/mycontainer/memory.swap.max
  echo 256 >/sys/fs/cgroup/mycontainer/pids.max
  exec contain [...]

applies a best-efforts limit of 2GB memory with a hard limit of 3GB. Swap
usage is restricted to at most 2G, and no more than 256 processes can be
forked within the container.

In addition, if contain is built and run on Linux 4.6 or later, a cgroup
namespace will be used to virtualise the container's view of the cgroup
hierarchy in /sys/fs/cgroup and /proc/*/cgroup. /sys/fs/cgroup/mycontainer
will appear as the root of the hierarchy at /sys/fs/cgroup within the
container.

See linux/kernel/Documentation/cgroup-v2.txt for detailed info on the
available controllers and configuration parameters.


Troubleshooting
---------------

The contain/pseudo error message 'Failed to unshare user namespace: Invalid
argument' typically means that your kernel is not compiled with support for
user namespaces, i.e. CONFIG_USER_NS is not set. The contain tool will also
die with a similar message referring to one of the other required namespaces
if support for that is not available in the kernel.

To run these tools you need to be running Linux 3.8 or later with

  CONFIG_CGROUPS=y
  CONFIG_UTS_NS=y
  CONFIG_TIME_NS=y
  CONFIG_IPC_NS=y
  CONFIG_USER_NS=y
  CONFIG_PID_NS=y
  CONFIG_NET_NS=y

set in the kernel build config. Note that before Linux 3.12, CONFIG_XFS_FS
conflicted with CONFIG_USER_NS, so these tools could not be used where XFS
support was compiled either into the kernel or as a module.

The contain tool will fail to mount /dev/pts unless

  CONFIG_DEVPTS_MULTIPLE_INSTANCES=y

is set in the kernel build config. Both container and host /dev/pts must be
mounted with -o newinstance, with /dev/ptmx symlinked to pts/ptmx.

Linux 3.12 introduced tighter restrictions on mounting proc and sysfs, which
broke older versions of contain. To comply with these new rules, contain
now ensures that procfs and sysfs are mounted in the new mount namespace
before pivoting into the container and detaching the host root.

A bug in Linux 3.12 will prevent contain from mounting /proc in a container
if binfmt_misc is mounted on /proc/sys/fs/binfmt_misc in the host
filesystem. This was fixed in Linux 3.13.

Linux 3.19 introduced restrictions on writing a user namespace GID map as an
unprivileged user unless setgroups() has been permanently disabled, which
broke older versions of contain. Run non-setuid and unprivileged, contain
and pseudo must now disable setgroups() to create containers, but if they
are installed setuid, they will bypass this kernel restriction and leave
setgroups() enabled in the resulting containers.


================================================
FILE: console.c
================================================
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <poll.h>
#include <signal.h>
#include <stdlib.h>
#include <termios.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/signalfd.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/wait.h>
#include "contain.h"

/* Terminal attributes of stdin, captured by savemode() and reapplied by
   restoremode(). */
static struct termios saved;

/*
 * Ensure stdin, stdout and stderr are all open (pointing any closed one at
 * /dev/null), then allocate a pseudo-terminal for the container console.
 * Returns the master side file descriptor; exits on failure to open
 * /dev/null or to allocate the pseudo-terminal.
 */
int getconsole(void) {
  static const int standard[] = {
    STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO
  };
  int devnull, index, inuse = 0, pty;

  devnull = open("/dev/null", O_RDWR);
  if (devnull < 0)
    errx(EXIT_FAILURE, "Failed to open /dev/null");

  /* Plug any standard descriptor that is not currently open. */
  for (index = 0; index < 3; index++)
    if (fcntl(standard[index], F_GETFD) < 0)
      dup2(devnull, standard[index]);

  /* open() may itself have landed on a standard descriptor: keep it then. */
  for (index = 0; index < 3; index++)
    if (devnull == standard[index])
      inuse = 1;
  if (!inuse)
    close(devnull);

  pty = posix_openpt(O_RDWR | O_NOCTTY);
  if (pty < 0)
    errx(EXIT_FAILURE, "Failed to allocate a console pseudo-terminal");
  grantpt(pty);
  unlockpt(pty);
  return pty;
}

/* Switch stdin to raw mode if it is a terminal; otherwise do nothing. */
static void rawmode(void) {
  struct termios raw;

  if (isatty(STDIN_FILENO)) {
    if (tcgetattr(STDIN_FILENO, &raw) < 0)
      err(EXIT_FAILURE, "tcgetattr");
    cfmakeraw(&raw);
    tcsetattr(STDIN_FILENO, TCSANOW, &raw);
  }
}

/* Reapply the attributes held in 'saved' to stdin if it is a terminal. */
static void restoremode(void) {
  if (!isatty(STDIN_FILENO))
    return;
  tcsetattr(STDIN_FILENO, TCSANOW, &saved);
}

/* Record the current attributes of stdin in 'saved' if it is a terminal. */
static void savemode(void) {
  if (!isatty(STDIN_FILENO))
    return;
  if (tcgetattr(STDIN_FILENO, &saved) < 0)
    err(EXIT_FAILURE, "tcgetattr");
}

/*
 * Start a new session, open the terminal 'name', request it as the
 * controlling terminal, and make it stdin, stdout and stderr of the
 * calling process. Exits if the terminal cannot be opened or queried.
 */
void setconsole(char *name) {
  struct termios settings;
  int tty;

  setsid();

  tty = open(name, O_RDWR);
  if (tty < 0)
    errx(EXIT_FAILURE, "Failed to open console in container");
  ioctl(tty, TIOCSCTTY, NULL);

  if (tcgetattr(tty, &settings) < 0)
    err(EXIT_FAILURE, "tcgetattr");
  settings.c_iflag |= IGNBRK | IUTF8;
  tcsetattr(tty, TCSANOW, &settings);

  dup2(tty, STDIN_FILENO);
  dup2(tty, STDOUT_FILENO);
  dup2(tty, STDERR_FILENO);

  /* Drop the original descriptor unless it is one of the standard three. */
  switch (tty) {
    case STDIN_FILENO:
    case STDOUT_FILENO:
    case STDERR_FILENO:
      break;
    default:
      close(tty);
  }
}

/*
 * Wait for the process 'child' to terminate and return its exit status, or
 * EXIT_FAILURE if it did not exit normally.
 *
 * If 'console' is negative, this simply blocks in waitpid(). Otherwise
 * 'console' is the master side of the console pseudo-terminal: stdin is
 * put into raw mode (restored at exit) and data is copied from stdin to
 * the console and from the console to stdout until the child terminates,
 * after which any remaining console output is flushed to stdout.
 */
int supervise(pid_t child, int console) {
  char buffer[PIPE_BUF];
  int exited = 0, signals, slave, status;
  sigset_t mask;
  ssize_t count, length, offset;
  struct pollfd fds[3];

  if (console < 0) {
    if (waitpid(child, &status, 0) < 0)
      err(EXIT_FAILURE, "waitpid");
    return WIFEXITED(status) ? WEXITSTATUS(status) : EXIT_FAILURE;
  }

  /* Receive SIGCHLD through a file descriptor so it can be polled. */
  sigemptyset(&mask);
  sigaddset(&mask, SIGCHLD);
  sigprocmask(SIG_BLOCK, &mask, NULL);
  if ((signals = signalfd(-1, &mask, 0)) < 0)
    err(EXIT_FAILURE, "signalfd");

  /* The child may have terminated before SIGCHLD was blocked. If so, it is
     reaped here and can never be waited for again, so remember that and
     queue a SIGCHLD to wake the loop below. */
  if (waitpid(child, &status, WNOHANG) > 0)
    if (WIFEXITED(status) || WIFSIGNALED(status)) {
      exited = 1;
      raise(SIGCHLD);
    }

  savemode();
  atexit(restoremode);
  rawmode();

  /* Hold the slave side open until the loop below has finished. */
  slave = open(ptsname(console), O_RDWR);

  fds[0].fd = console;
  fds[0].events = POLLIN;
  fds[1].fd = STDIN_FILENO;
  fds[1].events = POLLIN;
  fds[2].fd = signals;
  fds[2].events = POLLIN;

  while (1) {
    if (poll(fds, 3, -1) < 0) {
      if (errno != EAGAIN && errno != EINTR)
        err(EXIT_FAILURE, "poll");
      /* revents is not valid after a failed poll: try again. */
      continue;
    }

    if (fds[0].revents & POLLIN) {
      if ((length = read(console, buffer, sizeof(buffer))) < 0)
        if (errno != EAGAIN && errno != EINTR)
          err(EXIT_FAILURE, "read");
      for (offset = 0; length > 0; offset += count, length -= count)
        while ((count = write(STDOUT_FILENO, buffer + offset, length)) < 0)
          if (errno != EAGAIN && errno != EINTR)
            err(EXIT_FAILURE, "write");
    }

    if (fds[1].revents & (POLLHUP | POLLIN)) {
      /* On end of input stop polling stdin altogether. A negative fd makes
         poll() ignore the entry; merely clearing events would still report
         POLLHUP on every call and turn this loop into a busy-wait. */
      if ((length = read(STDIN_FILENO, buffer, sizeof(buffer))) == 0)
        fds[1].fd = -1;
      else if (length < 0 && errno != EAGAIN && errno != EINTR)
        err(EXIT_FAILURE, "read");
      for (offset = 0; length > 0; offset += count, length -= count)
        while ((count = write(console, buffer + offset, length)) < 0)
          if (errno != EAGAIN && errno != EINTR)
            err(EXIT_FAILURE, "write");
    }

    if (fds[2].revents & POLLIN) {
      if (read(signals, buffer, sizeof(buffer)) < 0)
        if (errno != EAGAIN && errno != EINTR)
          err(EXIT_FAILURE, "read");
      /* Already reaped before the loop started: status is final. */
      if (exited)
        break;
      if (waitpid(child, &status, WNOHANG) > 0)
        if (WIFEXITED(status) || WIFSIGNALED(status))
          break;
    }
  }

  close(signals);
  close(slave);

  /* Flush whatever console output is still pending, stopping at end of
     file or on the first read error other than EAGAIN/EINTR. */
  while ((length = read(console, buffer, sizeof(buffer)))) {
    if (length < 0 && errno != EAGAIN && errno != EINTR)
      break;
    for (offset = 0; length > 0; offset += count, length -= count)
      while ((count = write(STDOUT_FILENO, buffer + offset, length)) < 0)
        if (errno != EAGAIN && errno != EINTR)
          err(EXIT_FAILURE, "write");
  }

  return WIFEXITED(status) ? WEXITSTATUS(status) : EXIT_FAILURE;
}


================================================
FILE: contain.c
================================================
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sysexits.h>
#include <unistd.h>
#include <linux/sched.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include "contain.h"

/* Print the command-line synopsis to stderr and exit with EX_USAGE. */
static void usage(const char *progname) {
  fprintf(stderr,
      "Usage: %s [OPTIONS] DIR [CMD [ARG]...]\n"
      "Options:\n"
      "  -c        disable console emulation in the container\n"
      "  -g MAP    set the container-to-host GID map\n"
      "  -i CMD    run a helper child inside the new namespaces\n"
      "  -n        share the host network unprivileged in the container\n"
      "  -o CMD    run a helper child outside the new namespaces\n"
      "  -u MAP    set the container-to-host UID map\n"
      "GID and UID maps are specified as "
      "START:LOWER:COUNT[,START:LOWER:COUNT]...\n",
      progname);
  exit(EX_USAGE);
}

/*
 * contain [OPTIONS] DIR [CMD [ARG]...]
 *
 * Unshare the user, cgroup, IPC, network (unless -n), mount, time, UTS and
 * PID namespaces, build a root filesystem from DIR, and run CMD (default
 * SHELL) as the first process forked into the new PID namespace. The
 * original process stays behind as supervisor and returns the value of
 * supervise() for that process.
 */
int main(int argc, char **argv) {
  char *gidmap = NULL, *inside = NULL, *outside = NULL, *uidmap = NULL;
  int hostnet = 0, master, option, stdio = 0;
  pid_t child, parent;

  while ((option = getopt(argc, argv, "+:cg:i:no:u:")) > 0)
    switch (option) {
      case 'c':
        stdio++;
        break;
      case 'g':
        gidmap = optarg;
        break;
      case 'i':
        inside = optarg;
        break;
      case 'n':
        hostnet++;
        break;
      case 'o':
        outside = optarg;
        break;
      case 'u':
        uidmap = optarg;
        break;
      default:
        usage(argv[0]);
    }

  if (argc <= optind)
    usage(argv[0]);

  /* Fork a helper before dropping privileges or unsharing anything. It
     stops itself at once and is resumed with SIGCONT further down, after
     the parent has unshared its namespaces; it then writes the ID maps for
     the parent and, if -o was given, drops privileges and runs that
     command. */
  parent = getpid();
  switch (child = fork()) {
    case -1:
      err(EXIT_FAILURE, "fork");
    case 0:
      raise(SIGSTOP);
      /* Only a helper without root privilege disables setgroups(). */
      if (geteuid() != 0)
        denysetgroups(parent);
      writemap(parent, GID, gidmap);
      writemap(parent, UID, uidmap);

      if (outside) {
        if (setgid(getgid()) < 0 || setuid(getuid()) < 0)
          errx(EXIT_FAILURE, "Failed to drop privileges");
        prctl(PR_SET_DUMPABLE, 1);
        execlp(SHELL, SHELL, "-c", outside, NULL);
        err(EXIT_FAILURE, "exec %s", outside);
      }

      exit(EXIT_SUCCESS);
  }

  if (setgid(getgid()) < 0 || setuid(getuid()) < 0)
    errx(EXIT_FAILURE, "Failed to drop privileges");
  prctl(PR_SET_DUMPABLE, 1);

  /* Report errno for unshare() failures: the cause (for example EINVAL
     when the kernel lacks support for a namespace) is what the user needs
     in order to diagnose the problem. */
  if (unshare(CLONE_NEWUSER) < 0)
    err(EXIT_FAILURE, "Failed to unshare user namespace");

#ifdef CLONE_NEWCGROUP
  if (unshare(CLONE_NEWCGROUP) < 0)
    err(EXIT_FAILURE, "Failed to unshare cgroup namespace");
#endif

  if (unshare(CLONE_NEWIPC) < 0)
    err(EXIT_FAILURE, "Failed to unshare IPC namespace");

  if (!hostnet && unshare(CLONE_NEWNET) < 0)
    err(EXIT_FAILURE, "Failed to unshare network namespace");

  if (unshare(CLONE_NEWNS) < 0)
    err(EXIT_FAILURE, "Failed to unshare mount namespace");

#ifdef CLONE_NEWTIME
  if (unshare(CLONE_NEWTIME) < 0)
    err(EXIT_FAILURE, "Failed to unshare time namespace");
#endif

  if (unshare(CLONE_NEWUTS) < 0)
    err(EXIT_FAILURE, "Failed to unshare UTS namespace");

  /* Let the stopped helper run now that the namespaces exist, and wait for
     it (and any -o command it executes) to finish. */
  waitforstop(child);
  kill(child, SIGCONT);
  waitforexit(child);

  /* Become root within the new user namespace; results are not checked. */
  setgid(0);
  setgroups(0, NULL);
  setuid(0);

  /* With -c there is no console pseudo-terminal: master stays at -1. */
  master = stdio ? -1 : getconsole();
  createroot(argv[optind], master, inside);

  if (unshare(CLONE_NEWPID) < 0)
    err(EXIT_FAILURE, "Failed to unshare PID namespace");

  switch (child = fork()) {
    case -1:
      err(EXIT_FAILURE, "fork");
    case 0:
      mountproc();
      if (!hostnet)
        mountsys();
      enterroot();

      if (master >= 0) {
        close(master);
        setconsole("/dev/console");
      }

      /* The command starts with an environment holding only this marker. */
      clearenv();
      putenv("container=contain");

      if (argv[optind + 1])
        execv(argv[optind + 1], argv + optind + 1);
      else
        execl(SHELL, SHELL, NULL);
      err(EXIT_FAILURE, "exec");
  }

  return supervise(child, master);
}


================================================
FILE: contain.h
================================================
#ifndef CONTAIN_H
#define CONTAIN_H

/* pid_t appears in the prototypes below, so pull in its definition here
   rather than relying on every includer to have done so already. */
#include <sys/types.h>

/* Selector for the kind of ID a map-related helper should operate on. */
#define GID 0
#define UID 1
#define INVALID ((unsigned) -1)
/* Shell used for -o helper commands and as the default command. */
#define SHELL "/bin/sh"

/* Per-kind lookups: the caller's real ID, the map file name, a printable
   name for the kind, and the path of the subordinate ID file. */
#define getid(type) ((unsigned) ((type) == GID ? getgid() : getuid()))
#define idfile(type) ((type) == GID ? "gid_map" : "uid_map")
#define idname(type) ((type) == GID ? "GID" : "UID")
#define subpath(type) ((type) == GID ? "/etc/subgid" : "/etc/subuid")

/* Functions shared between the utilities. */
char *append(char **destination, const char *format, ...);
void createroot(char *src, int console, char *helper);
void denysetgroups(pid_t pid);
void enterroot(void);
int getconsole(void);
void mountproc(void);
void mountsys(void);
void seal(char **argv, char **envp);
void setconsole(char *name);
char *string(const char *format, ...);
int supervise(pid_t child, int console);
char *tmpdir(void);
void waitforstop(pid_t child);
void waitforexit(pid_t child);
void writemap(pid_t pid, int type, char *map);

#endif


================================================
FILE: inject.c
================================================
#define _GNU_SOURCE
#include <dirent.h>
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <grp.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include "contain.h"

/*
 * Return the parent PID of process 'child' as read from /proc/PID/stat, or
 * -1 if the file cannot be read or does not have the expected layout.
 */
static int getparent(pid_t child) {
  char *end, *line = NULL, *path, *start;
  pid_t parent = -1;
  size_t size = 0;
  FILE *file;

  path = string("/proc/%u/stat", child);
  file = fopen(path, "r");
  free(path);

  if (file && getline(&line, &size, file) >= 0)
    /* "PID (NAME) S PPID ...", so PPID begins 4 chars after the last ')' */
    if ((start = strrchr(line, ')')) && strlen(start) >= 4) {
      parent = strtol(start + 4, &end, 10);
      /* Parsing started at start + 4, so that is where 'end' is left when
         no digits were consumed. */
      if (end == start + 4 || *end != ' ')
        parent = -1;
    }

  if (file)
    fclose(file);
  free(line);

  return parent;
}

/*
 * Join the namespace of kind 'type' that process 'pid' belongs to, via
 * /proc/PID/ns/TYPE. A missing namespace file or a failed setns() is fatal
 * only for the "user" namespace; failing to open the file for any reason
 * other than ENOENT is always fatal.
 */
static void join(pid_t pid, char *type) {
  int isuser = strcmp(type, "user") == 0;
  char *nspath = string("/proc/%u/ns/%s", pid, type);
  int nsfd = open(nspath, O_RDONLY);

  if (nsfd < 0) {
    if (errno != ENOENT)
      errx(EXIT_FAILURE, "PID %u does not belong to you", pid);
    if (isuser)
      errx(EXIT_FAILURE, "PID %u not found or user namespace unavailable", pid);
  } else {
    if (syscall(__NR_setns, nsfd, 0) < 0 && isuser)
      errx(EXIT_FAILURE, "Failed to join user namespace");
    close(nsfd);
  }

  free(nspath);
}

/* Print the command-line synopsis to stderr and exit with status 64. */
static void usage(const char *progname) {
  enum { USAGE_STATUS = 64 };

  fprintf(stderr, "Usage: %s PID [CMD [ARG]...]\n", progname);
  exit(USAGE_STATUS);
}

/*
 * inject PID [CMD [ARG]...]
 *
 * Run CMD (or $SHELL, falling back to SHELL) inside the container whose
 * supervisor is process PID. The container init is located as the child of
 * PID that has "container=contain" in its environment.
 */
int main(int argc, char **argv, char **envp) {
  char *end, *item = NULL, *path;
  pid_t child = -1, parent, pid;
  size_t size;
  struct dirent *entry;
  DIR *dir;
  FILE *file;

  /* NOTE(review): seal() is defined outside this file; confirm its purpose
     there. */
  seal(argv, envp);
  if (argc < 2)
    usage(argv[0]);

  /* PID must be a decimal number with no trailing characters. */
  parent = strtol(argv[1], &end, 10);
  if (end == argv[1] || *end)
    usage(argv[0]);

  /* Refuse to run with elevated effective credentials. */
  if (geteuid() != getuid())
    errx(EXIT_FAILURE, "setuid installation is unsafe");
  else if (getegid() != getgid())
    errx(EXIT_FAILURE, "setgid installation is unsafe");

  /* Enter the supervisor's user namespace first, then try to become root
     within it; the results of these calls are not checked. */
  join(parent, "user");
  setgid(0);
  setgroups(0, NULL);
  setuid(0);

  /* Scan /proc for a process whose parent is the supervisor and whose
     environment contains the container marker. */
  if (!(dir = opendir("/proc")))
    errx(EXIT_FAILURE, "Failed to list processes");
  while (child < 0 && (entry = readdir(dir))) {
    /* Only purely numeric directory names are processes. */
    pid = strtol(entry->d_name, &end, 10);
    if (end == entry->d_name || *end)
      continue;
    if (getparent(pid) == parent) {
      path = string("/proc/%u/environ", pid);
      if ((file = fopen(path, "r"))) {
        /* The environ file is a sequence of NUL-terminated entries. */
        while (getdelim(&item, &size, '\0', file) >= 0)
          if (strcmp(item, "container=contain") == 0)
            child = pid;
        fclose(file);
      }
      free(path);
    }
  }
  closedir(dir);
  if (item)
    free(item);

  if (child < 0)
    errx(EXIT_FAILURE, "PID %u is not a container supervisor", parent);

  /* Join the remaining namespaces of the container init. join() treats
     failures here as non-fatal unless the namespace file exists but cannot
     be opened. */
  join(child, "cgroup");
  join(child, "ipc");
  join(child, "net");
  join(child, "pid");
  join(child, "time");
  join(child, "uts");
  join(child, "mnt");

  if (chdir("/") < 0)
    errx(EXIT_FAILURE, "Failed to enter container root directory");

  /* Run the command in a forked child and wait for it. */
  switch (child = fork()) {
    case -1:
      err(EXIT_FAILURE, "fork");
    case 0:
      if (argv[2])
        execvp(argv[2], argv + 2);
      else if (getenv("SHELL"))
        execl(getenv("SHELL"), getenv("SHELL"), NULL);
      else
        execl(SHELL, SHELL, NULL);
      err(EXIT_FAILURE, "exec");
  }

  waitforexit(child);
  return EXIT_SUCCESS;
}


================================================
FILE: map.c
================================================
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <grp.h>
#include <fcntl.h>
#include <pwd.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "contain.h"

/* Write "deny" to /proc/PID/setgroups for the given process. Exits with an
   error if the file cannot be opened or the write is incomplete. */
void denysetgroups(pid_t pid) {
  static const char deny[] = "deny";
  const size_t want = sizeof deny - 1;
  char *procfile = string("/proc/%d/setgroups", pid);
  int handle = open(procfile, O_WRONLY);

  if (handle < 0 || write(handle, deny, want) != (ssize_t) want)
    errx(EXIT_FAILURE, "Failed to disable setgroups() in container");

  close(handle);
  free(procfile);
}

/* Read the ID map file of a process (or of the caller when pid is -1) and
   return it as a newly allocated "FIRST:LOWER:COUNT[,FIRST:LOWER:COUNT]..."
   string. The file name under /proc comes from idfile(type). Exits with an
   error if the file is unreadable, malformed or empty. The caller frees the
   result. */
static char *getmap(pid_t pid, int type) {
  char *buffer = NULL, *map = NULL, *procfile;
  size_t capacity = 0;
  unsigned extent, inner, outer;
  FILE *stream;

  procfile = pid == -1
    ? string("/proc/self/%s", idfile(type))
    : string("/proc/%d/%s", pid, idfile(type));

  stream = fopen(procfile, "r");
  if (stream == NULL)
    errx(EXIT_FAILURE, "Cannot read %s", procfile);

  for (;;) {
    if (getline(&buffer, &capacity, stream) < 0)
      break;
    if (sscanf(buffer, " %u %u %u", &inner, &outer, &extent) != 3)
      errx(EXIT_FAILURE, "Invalid map data in %s", procfile);
    /* Entries after the first are separated by commas. */
    append(&map, "%s%u:%u:%u", map ? "," : "", inner, outer, extent);
  }

  if (map == NULL)
    errx(EXIT_FAILURE, "Invalid map data in %s", procfile);

  fclose(stream);
  free(buffer);
  free(procfile);
  return map;
}

/* Parse the next FIRST:LOWER:COUNT triple from an ID map string.
   Leading ',' and ';' separators are skipped. Returns a pointer just past
   the parsed triple, or NULL when map is NULL or nothing but separators
   remains. Exits with an error if the text is not a valid triple. */
static char *mapitem(char *map, unsigned *first, unsigned *lower,
    unsigned *count) {
  ssize_t consumed;

  if (map == NULL)
    return NULL;

  map += strspn(map, ",;");
  if (*map == '\0')
    return NULL;

  if (sscanf(map, "%u:%u:%u%zn", first, lower, count, &consumed) < 3)
    errx(EXIT_FAILURE, "Invalid ID map '%s'", map);
  return map + consumed;
}

/* Parse the next START:LENGTH pair from an ID range string.
   Leading ',' and ';' separators are skipped. Returns a pointer just past
   the parsed pair, or NULL when range is NULL or nothing but separators
   remains. Exits with an error if the text is not a valid pair. */
static char *rangeitem(char *range, unsigned *start, unsigned *length) {
  ssize_t consumed;

  if (range == NULL)
    return NULL;

  range += strspn(range, ",;");
  if (*range == '\0')
    return NULL;

  if (sscanf(range, "%u:%u%zn", start, length, &consumed) < 2)
    errx(EXIT_FAILURE, "Invalid ID range '%s'", range);
  return range + consumed;
}

/* Build the list of ID ranges the calling user may map, as a newly
   allocated "START:LENGTH[,START:LENGTH]..." string that the caller frees.

   The list always starts with the single ID returned by getid(type). If the
   file named by subpath(type) can be opened, every line in it that belongs
   to the caller contributes one more range. A line belongs to the caller if
   it starts with the caller's numeric UID or with the caller's username. */
static char *readranges(int type) {
  char *line = NULL, *entry, *range, *user;
  size_t end, size;
  struct passwd *passwd;
  uid_t uid;
  unsigned int length, start;
  FILE *file;

  range = string("%u:1", getid(type));
  if (!(file = fopen(subpath(type), "r")))
    return range;

  /* Trust $USER, $LOGNAME or getlogin() only when the name resolves to the
     real UID; otherwise fall back to the passwd entry for that UID. */
  uid = getuid();
  user = getenv("USER");
  user = user ? user : getenv("LOGNAME");
  user = user ? user : getlogin();
  if (!user || !(passwd = getpwnam(user)) || passwd->pw_uid != uid) {
    if (!(passwd = getpwuid(uid)))
      errx(EXIT_FAILURE, "Failed to validate your username");
    user = passwd->pw_name;
  }
  endpwent();

  while (getline(&line, &size, file) >= 0) {
    /* Match on the numeric UID first, then on the username prefix. A longer
       name that merely begins with the username is rejected below, because
       the ':' required by sscanf will not follow it. */
    if (strtol(line, &entry, 10) != uid || entry == line) {
      if (strncmp(line, user, strlen(user)))
        continue;
      entry = line + strlen(user);
    }
    if (sscanf(entry, ":%u:%u%zn", &start, &length, &end) < 2)
      continue;
    /* entry[end] is the first character after the length field. It must be
       ':', a newline, or the terminating NUL of a final unterminated line
       (strchr also matches the terminator of its search string). */
    if (strchr(":\n", entry[end]))
      append(&range, ",%u:%u", start, length);
  }

  free(line);
  fclose(file);
  return range;
}

/* Build the default ID map used when the caller is root, as a newly
   allocated "FIRST:LOWER:COUNT,..." string that the caller frees.

   Container ID 0 is mapped onto the highest ID present in the caller's own
   map. Every other ID in the caller's map, except 0 and that highest ID, is
   mapped onto itself. Exits with an error when the only available ID is 0. */
static char *rootdefault(int type) {
  char *cursor, *map, *result;
  unsigned count, first, last = INVALID, lower;

  /* First pass: find the highest ID covered by any extent. */
  cursor = map = getmap(-1, type);
  while ((cursor = mapitem(cursor, &first, &lower, &count)))
    if (last == INVALID || last < first + count - 1)
      last = first + count - 1;
  result = string("0:%u:1", last);

  /* Second pass: identity-map what is left of each extent. */
  cursor = map;
  while ((cursor = mapitem(cursor, &first, &lower, &count))) {
    if (first == 0) {
      /* errx() appends its own newline, so the message carries none. */
      if (count == 1 && first >= last)
        errx(EXIT_FAILURE, "No unprivileged %s available", idname(type));
      /* Container ID 0 is already taken by the entry created above. */
      first++, lower++, count--;
    }

    /* Drop the final ID of the extent that contains last. */
    if (last <= first + count - 1 && count > 0)
      count--;

    if (count > 0)
      append(&result, "%s%u:%u:%u", result ? "," : "", first, first, count);
  }

  free(map);
  return result;
}

/* Build the default ID map used when the caller is not root, as a newly
   allocated "FIRST:LOWER:COUNT,..." string that the caller frees.

   Without an effective UID of 0 the map is just container ID 0 onto the ID
   returned by getid(type). Otherwise each range from readranges() is
   intersected with the extents of the caller's own map, and the overlapping
   parts are assigned consecutive container IDs starting from 0. Returns
   NULL if no range overlaps the caller's map. */
static char *userdefault(int type) {
  char *cursor, *map, *range, *ranges, *result = NULL;
  unsigned count, first, index = 0, length, lower, start;

  if (geteuid() != 0)
    return string("0:%u:1", getid(type));

  map = getmap(-1, type);
  /* Keep the original pointer: the parsing cursor ends up NULL, so freeing
     the cursor would leak the buffer. */
  range = ranges = readranges(type);

  while ((range = rangeitem(range, &start, &length))) {
    cursor = map;
    while ((cursor = mapitem(cursor, &first, &lower, &count))) {
      /* Skip extents that do not overlap the range at all. */
      if (start + length <= first || first + count <= start)
        continue;
      /* Clip the range so that it ends where the extent ends. */
      if (first + count < start + length)
        length = first + count - start;
      /* Clip the range so that it starts where the extent starts. The
         container index advances past the IDs that were cut off. */
      if (start < first) {
        index += first - start;
        length -= first - start;
        start = first;
      }
      append(&result, "%s%u:%u:%u", result ? "," : "", index, start, length);
      index += length;
    }
  }

  free(map);
  free(ranges);
  return result;
}

/* Check that the IDs first .. first + count - 1 are all covered by the
   ranges in the given range string, exiting with an error if not.

   The first overlapping range settles the part it covers. Any uncovered
   part on either side is checked recursively against the ranges that follow
   it in the string. */
static void validate(char *range, unsigned first, unsigned count) {
  unsigned base, span;

  for (range = rangeitem(range, &base, &span); range;
      range = rangeitem(range, &base, &span)) {
    if (first >= base + span || base >= first + count)
      continue;

    if (first < base)
      validate(range, first, base - first);
    if (first + count > base + span)
      validate(range, base + span, first + count - base - span);
    return;
  }

  errx(EXIT_FAILURE, "Cannot map onto IDs that are not delegated to you");
}

/* Check every entry of an ID map against the permitted ranges. Only the
   LOWER:COUNT part of each entry is passed to validate(), which exits on
   failure. */
static void verifymap(char *map, char *range) {
  unsigned extent, inner, outer;
  char *cursor;

  for (cursor = mapitem(map, &inner, &outer, &extent); cursor;
      cursor = mapitem(cursor, &inner, &outer, &extent))
    validate(range, outer, extent);
}

/* Write an ID map to the map file of process pid. The file name under
   /proc/PID comes from idfile(type).

   map is a "FIRST:LOWER:COUNT[,FIRST:LOWER:COUNT]..." string, or NULL to
   use the default from rootdefault() or userdefault(). A map supplied by a
   caller other than root is first checked against readranges(). Exits with
   an error if the map has no entries or cannot be written in full. */
void writemap(pid_t pid, int type, char *map) {
  char *cursor, *defaults = NULL, *path, *range, *text = NULL;
  int fd;
  unsigned count, first, lower;

  if (!map) {
    /* Remember the allocation so it can be released after parsing. */
    map = defaults = (getuid() == 0 ? rootdefault : userdefault)(type);
  } else if (getuid() != 0) {
    range = readranges(type);
    verifymap(map, range);
    free(range);
  }

  /* Convert to the "FIRST LOWER COUNT\n" line format of the map file. */
  cursor = map;
  while ((cursor = mapitem(cursor, &first, &lower, &count)))
    append(&text, "%u %u %u\n", first, lower, count);
  free(defaults);

  /* An empty map leaves text NULL, which must not reach strlen(). */
  if (!text)
    errx(EXIT_FAILURE, "Failed to set container %s map", idname(type));

  path = string("/proc/%d/%s", pid, idfile(type));
  if ((fd = open(path, O_WRONLY)) < 0)
    errx(EXIT_FAILURE, "Failed to set container %s map", idname(type));
  else if (write(fd, text, strlen(text)) != (ssize_t) strlen(text))
    errx(EXIT_FAILURE, "Failed to set container %s map", idname(type));

  close(fd);
  free(path);
  free(text);
}


================================================
FILE: mount.c
================================================
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include "contain.h"

/* Path of the temporary directory holding the new root mount. Set by
   createroot(), used by cleanup() at exit, and reset to NULL by enterroot(). */
static char *root;

/* Bind-mount src onto dst. dst is first created as an empty file when
   possible; failure to create it is ignored. Exits with an error if the
   mount itself fails. */
static void bindnode(char *src, char *dst) {
  int placeholder = open(dst, O_WRONLY | O_CREAT, 0600);

  if (placeholder >= 0)
    close(placeholder);

  if (mount(src, dst, NULL, MS_BIND, NULL) >= 0)
    return;
  errx(EXIT_FAILURE, "Failed to bind %s into new /dev filesystem", src);
}

/* Exit handler: lazily unmount and remove the temporary root directory, if
   one is still recorded in root. */
static void cleanup(void) {
  if (root == NULL)
    return;

  umount2(root, MNT_DETACH);
  rmdir(root);
}

/* Prepare the new root filesystem and make it the current directory.

   src is recursively bind-mounted onto a fresh temporary directory, and
   cleanup() is registered to run at exit. Inside it a tmpfs /dev is built
   with its own devpts instance, a dev/tmp directory, bind mounts of the
   basic host device nodes and a ptmx symlink. If console is non-negative,
   the terminal named by ptsname(console) is bound as dev/console. If helper
   is non-NULL it is run with SHELL -c and waited for. */
void createroot(char *src, int console, char *helper) {
  /* Each host path doubles as its own target: skipping the leading '/'
     gives the same path relative to the new root. */
  static char *const devices[] = {
    "/dev/full", "/dev/null", "/dev/random",
    "/dev/tty", "/dev/urandom", "/dev/zero"
  };
  mode_t saved;
  pid_t worker;
  size_t i;

  root = tmpdir();
  atexit(cleanup);

  if (mount(src, root, NULL, MS_BIND | MS_REC, NULL) < 0)
    errx(EXIT_FAILURE, "Failed to bind new root filesystem");
  if (chdir(root) < 0)
    errx(EXIT_FAILURE, "Failed to enter new root filesystem");

  /* Clear the umask so the directories get exactly the modes requested. */
  saved = umask(0);

  mkdir("dev", 0755);
  if (mount("tmpfs", "dev", "tmpfs", 0, "mode=0755") < 0)
    errx(EXIT_FAILURE, "Failed to mount /dev tmpfs in new root filesystem");

  mkdir("dev/pts", 0755);
  if (mount("devpts", "dev/pts", "devpts", 0, "newinstance,ptmxmode=666") < 0)
    errx(EXIT_FAILURE, "Failed to mount /dev/pts in new root filesystem");

  mkdir("dev/tmp", 0755);
  umask(saved);

  if (console >= 0)
    bindnode(ptsname(console), "dev/console");
  for (i = 0; i < sizeof devices / sizeof *devices; i++)
    bindnode(devices[i], devices[i] + 1);
  symlink("pts/ptmx", "dev/ptmx");

  if (!helper)
    return;

  worker = fork();
  if (worker == -1)
    err(EXIT_FAILURE, "fork");
  if (worker == 0) {
    execlp(SHELL, SHELL, "-c", helper, NULL);
    err(EXIT_FAILURE, "exec %s", helper);
  }
  waitforexit(worker);
}

/* Pivot into the root prepared in the current directory, parking the old
   root on dev/tmp. The temporary directory is then removed through the old
   root (best effort), root is cleared so cleanup() does nothing, and the
   old root is detached. Exits with an error if the pivot or the detach
   fails. */
void enterroot(void) {
  char *stale;

  if (syscall(__NR_pivot_root, ".", "dev/tmp") < 0)
    errx(EXIT_FAILURE, "Failed to pivot into new root filesystem");

  if (chdir("/dev/tmp") >= 0) {
    /* Strip leading slashes so the path resolves relative to the old root. */
    for (stale = root; *stale == '/'; stale++)
      ;
    rmdir(stale);
  }

  root = NULL;

  if (chdir("/") < 0 || umount2("/dev/tmp", MNT_DETACH) < 0)
    errx(EXIT_FAILURE, "Failed to detach old root filesystem");
  rmdir("/dev/tmp");
}

/* Create proc (mode 0755, ignoring failure) under the current directory and
   mount a proc filesystem on it. Exits with an error if the mount fails. */
void mountproc(void) {
  mode_t saved = umask(0);

  mkdir("proc", 0755);
  umask(saved);

  if (mount("proc", "proc", "proc", 0, NULL) >= 0)
    return;
  errx(EXIT_FAILURE, "Failed to mount /proc in new root filesystem");
}

/* Create sys (mode 0755, ignoring failure) under the current directory and
   mount sysfs on it, exiting with an error if that mount fails. A cgroup2
   mount on sys/fs/cgroup is then attempted; its result is ignored. */
void mountsys(void) {
  mode_t saved = umask(0);

  mkdir("sys", 0755);
  umask(saved);

  if (mount("sysfs", "sys", "sysfs", 0, NULL) < 0)
    errx(EXIT_FAILURE, "Failed to mount /sys in new root filesystem");

  (void) mount("cgroup2", "sys/fs/cgroup", "cgroup2", 0, NULL);
}


================================================
FILE: pseudo.c
================================================
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <grp.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sysexits.h>
#include <unistd.h>
#include <sys/prctl.h>
#include "contain.h"

/* Print the command-line synopsis for progname on stderr and exit with
   status EX_USAGE. Never returns. */
static void usage(const char *progname) {
  fprintf(stderr,
    "Usage: %s [OPTIONS] [CMD [ARG]...]\n"
    "Options:\n"
    "  -g MAP    set the user namespace GID map\n"
    "  -u MAP    set the user namespace UID map\n"
    "GID and UID maps are specified as "
    "START:LOWER:COUNT[,START:LOWER:COUNT]...\n",
    progname);
  exit(EX_USAGE);
}

/* Entry point for pseudo: create a new user namespace, have a helper child
   write its GID and UID maps, then run CMD (or a shell) inside it.
   Options: -g MAP and -u MAP override the default maps. */
int main(int argc, char **argv) {
  char *gidmap = NULL, *uidmap = NULL;
  int option;
  pid_t child, parent;

  /* The leading '+' stops option parsing at the first non-option argument,
     so options belonging to CMD are left alone. */
  while ((option = getopt(argc, argv, "+:g:u:")) > 0)
    switch (option) {
      case 'g':
        gidmap = optarg;
        break;
      case 'u':
        uidmap = optarg;
        break;
      default:
        usage(argv[0]);
    }

  /* The forked child stops itself straight away. Once the parent continues
     it, it writes the maps for the parent process and exits. It is forked
     before the parent drops privileges, so it keeps the original ones. */
  parent = getpid();
  switch (child = fork()) {
    case -1:
      err(EXIT_FAILURE, "fork");
    case 0:
      raise(SIGSTOP);
      if (geteuid() != 0)
        denysetgroups(parent);
      writemap(parent, GID, gidmap);
      writemap(parent, UID, uidmap);
      exit(0);
  }

  /* The parent drops to the real IDs before creating the namespace. */
  if (setgid(getgid()) < 0 || setuid(getuid()) < 0)
    errx(EXIT_FAILURE, "Failed to drop privileges");
  /* NOTE(review): presumably re-enabled so the helper child can open this
     process's /proc files after the ID change - confirm. */
  prctl(PR_SET_DUMPABLE, 1);

  if (unshare(CLONE_NEWUSER) < 0)
    errx(EXIT_FAILURE, "Failed to unshare user namespace");

  /* Release the child only now that the new namespace exists, then wait for
     it to finish writing the maps. waitforexit() terminates this process if
     the child reports a non-zero exit status. */
  waitforstop(child);
  kill(child, SIGCONT);
  waitforexit(child);

  /* Switch to ID 0 inside the new namespace; results are not checked. */
  setgid(0);
  setgroups(0, NULL);
  setuid(0);

  /* Run CMD if given, else $SHELL, else the compiled-in SHELL. */
  if (argv[optind])
    execvp(argv[optind], argv + optind);
  else if (getenv("SHELL"))
    execl(getenv("SHELL"), getenv("SHELL"), NULL);
  else
    execl(SHELL, SHELL, NULL);

  /* Only reached when the exec call failed. */
  err(EXIT_FAILURE, "exec");
  return EXIT_FAILURE;
}


================================================
FILE: util.c
================================================
#define _GNU_SOURCE
#include <err.h>
#include <errno.h>
#include <fcntl.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/sendfile.h>
#include <sys/types.h>
#include <sys/wait.h>
#include "contain.h"

/* Append printf-style formatted text to the heap string *destination.
   A NULL *destination is treated as an empty string. *destination is
   replaced by a newly allocated string, which is also returned; the old
   string is freed. Exits with an error if allocation or formatting fails. */
char *append(char **destination, const char *format, ...) {
  char *joined, *suffix;
  int written;
  va_list ap;

  va_start(ap, format);
  written = vasprintf(&suffix, format, ap);
  va_end(ap);
  if (written < 0)
    err(EXIT_FAILURE, "asprintf");

  /* Nothing to join onto: the formatted text becomes the whole string. */
  if (!*destination)
    return *destination = suffix;

  if (asprintf(&joined, "%s%s", *destination, suffix) < 0)
    err(EXIT_FAILURE, "asprintf");

  free(suffix);
  free(*destination);
  return *destination = joined;
}

/* Make sure the program runs from a sealed in-memory copy of itself.
   Returns immediately if /proc/self/exe already carries exactly the seal
   set below. Otherwise the executable is copied into a memfd, sealed, and
   re-executed with the given argv and envp. In that case the function does
   not return; any failure exits with an error. */
void seal(char **argv, char **envp) {
  const int seals = F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE;
  int image, self;
  ssize_t copied;

  self = open("/proc/self/exe", O_RDONLY);
  if (self < 0)
    err(EXIT_FAILURE, "open /proc/self/exe");

  if (fcntl(self, F_GET_SEALS) == seals) {
    close(self);
    return;
  }

  image = memfd_create("/proc/self/exe", MFD_CLOEXEC | MFD_ALLOW_SEALING);
  if (image < 0)
    err(EXIT_FAILURE, "memfd_create");

  /* Copy until sendfile reports end of file, retrying on EAGAIN and EINTR. */
  do {
    copied = sendfile(image, self, NULL, BUFSIZ);
    if (copied < 0 && errno != EAGAIN && errno != EINTR)
      err(EXIT_FAILURE, "sendfile");
  } while (copied != 0);
  close(self);

  if (fcntl(image, F_ADD_SEALS, seals) < 0)
    err(EXIT_FAILURE, "fcntl F_ADD_SEALS");

  fexecve(image, argv, envp);
  err(EXIT_FAILURE, "fexecve");
}

/* Return a newly allocated string built from a printf-style format and
   arguments. The caller frees it. Exits with an error on failure. */
char *string(const char *format, ...) {
  char *text;
  int written;
  va_list ap;

  va_start(ap, format);
  written = vasprintf(&text, format, ap);
  va_end(ap);

  if (written < 0)
    err(EXIT_FAILURE, "asprintf");
  return text;
}

/* Create a uniquely named directory from the template /tmp/XXXXXX and
   return its path as a newly allocated string. Exits with an error if the
   template cannot be allocated or the directory cannot be created. */
char *tmpdir(void) {
  char *template = strdup("/tmp/XXXXXX");

  if (template == NULL)
    err(EXIT_FAILURE, "strdup");
  if (mkdtemp(template) == NULL)
    errx(EXIT_FAILURE, "Failed to create temporary directory");
  return template;
}

/* Wait for child to terminate. Returns only if it exited with status 0.
   If it exited with a non-zero status, this process exits with the same
   status. If it was killed by a signal, this process exits with 128 plus
   the signal number, so a crashed child is never mistaken for success.
   Exits with an error if waitpid fails. */
void waitforexit(pid_t child) {
  int status;

  if (waitpid(child, &status, 0) < 0)
    err(EXIT_FAILURE, "waitpid");

  /* WEXITSTATUS is only meaningful after a normal exit; it reads as 0 for
     a signal death, so that case is checked first. */
  if (WIFSIGNALED(status))
    exit(128 + WTERMSIG(status));
  if (WEXITSTATUS(status) != EXIT_SUCCESS)
    exit(WEXITSTATUS(status));
}

/* Wait for child to stop. Returns only once it has stopped.
   If it terminates instead, this process exits too: with the child's exit
   status after a normal exit, or with 128 plus the signal number if the
   child was killed by a signal. Exits with an error if waitpid fails. */
void waitforstop(pid_t child) {
  int status;

  if (waitpid(child, &status, WUNTRACED) < 0)
    err(EXIT_FAILURE, "waitpid");
  if (WIFSTOPPED(status))
    return;

  /* WEXITSTATUS reads as 0 for a signal death, so report that case
     separately instead of exiting with apparent success. */
  if (WIFSIGNALED(status))
    exit(128 + WTERMSIG(status));
  exit(WEXITSTATUS(status));
}