Repository: newsnowlabs/runcvm Branch: main Commit: 1ec8d38f6596 Files: 46 Total size: 206.9 KB Directory structure: gitextract_f_pznfpt/ ├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md ├── build/ │ └── build.sh ├── build-utils/ │ ├── entrypoint-install.sh │ └── make-bundelf-bundle.sh ├── kernels/ │ └── oraclelinux/ │ ├── 95virtiofs/ │ │ ├── module-setup.sh │ │ ├── mount-virtiofs.sh │ │ └── parse-virtiofs.sh │ └── addvirtiofs.conf ├── patches/ │ ├── dnsmasq/ │ │ └── remove-passwd-requirement.patch │ ├── dropbear/ │ │ └── runcvm.patch │ ├── mkinitfs/ │ │ └── nlplug-findfs.patch │ └── seabios/ │ └── qemu-fw-cfg-fix.patch ├── qemu-exit/ │ └── qemu-exit.c ├── runcvm-init/ │ ├── VERSION.h │ └── dumb-init.c ├── runcvm-scripts/ │ ├── functions/ │ │ └── cgroupfs │ ├── runcvm-ctr-defaults │ ├── runcvm-ctr-entrypoint │ ├── runcvm-ctr-exec │ ├── runcvm-ctr-exit │ ├── runcvm-ctr-qemu │ ├── runcvm-ctr-qemu-ifdown │ ├── runcvm-ctr-qemu-ifup │ ├── runcvm-ctr-qemu-poweroff │ ├── runcvm-ctr-shutdown │ ├── runcvm-ctr-virtiofsd │ ├── runcvm-install-runtime.sh │ ├── runcvm-ip-functions │ ├── runcvm-runtime │ ├── runcvm-vm-exec │ ├── runcvm-vm-init │ ├── runcvm-vm-qemu-ga │ ├── runcvm-vm-start │ └── runcvm-vm-start-wrapper └── tests/ ├── 00-http-docker-swarm/ │ ├── node/ │ │ ├── Dockerfile │ │ └── docker.sh │ └── test ├── 01-mariadb/ │ └── test ├── 02-user-workdir/ │ └── test ├── 03-env/ │ └── test ├── framework.sh └── run ================================================ FILE CONTENTS ================================================ ================================================ FILE: .dockerignore ================================================ # Ignore files matching the following patterns within docker build **/*~ depot.json ================================================ FILE: .gitignore ================================================ .#* *~ *.bak *.o *.version *.orig *.tdy TAGS \#*\# .packlist perllocal.pod .c9 .Trash-1000 .vscode depot.json ================================================ FILE: Dockerfile ================================================ # syntax=docker/dockerfile:1.3-labs # Alpine version to build with ARG ALPINE_VERSION=3.19 # --- BUILD STAGE --- # Build base alpine-sdk image for later build stages FROM alpine:$ALPINE_VERSION as alpine-sdk RUN apk update && apk add --no-cache alpine-sdk coreutils && \ abuild-keygen -an && \ # Copy the public keys to the system keys cp -a /root/.abuild/*.pub /etc/apk/keys && \ git clone --depth 1 --single-branch --filter=blob:none --sparse https://gitlab.alpinelinux.org/alpine/aports.git ~/aports && \ cd ~/aports/ && \ git sparse-checkout set main/seabios main/dnsmasq main/dropbear main/mkinitfs # --- BUILD STAGE --- # Build patched SeaBIOS packages # to allow disabling of BIOS output by QEMU # (without triggering QEMU warnings) FROM alpine-sdk as alpine-seabios ADD patches/seabios/qemu-fw-cfg-fix.patch /root/aports/main/seabios/0003-qemu-fw-cfg-fix.patch RUN <<EOF cd /root/aports/main/seabios echo 'source="${source} 0003-qemu-fw-cfg-fix.patch"' >>APKBUILD abuild -rFf EOF # --- BUILD STAGE --- # Build patched dnsmasq # that does not require /etc/passwd file to run # (needed for images such as hello-world) FROM alpine-sdk as alpine-dnsmasq ADD patches/dnsmasq/remove-passwd-requirement.patch /root/aports/main/dnsmasq/remove-passwd-requirement.patch RUN <<EOF cd /root/aports/main/dnsmasq echo 'source="${source} remove-passwd-requirement.patch"' >>APKBUILD abuild -rFf EOF # --- BUILD STAGE --- # Build patched dropbear with epka plugin # that does not require /etc/passwd or PAM to run FROM alpine-sdk as
alpine-dropbear ADD patches/dropbear/runcvm.patch /root/aports/main/dropbear/runcvm.patch RUN <<EOF cd /root/aports/main/dropbear echo 'source="${source} runcvm.patch"' >>APKBUILD abuild -rFf cd /root git clone https://github.com/fabriziobertocci/dropbear-epka.git cd dropbear-epka apk add --no-cache automake autoconf libtool libtoolize --force aclocal autoheader || true automake --force-missing --add-missing autoconf ./configure make install EOF # --- BUILD STAGE --- # Build patched mkinitfs/nlplug-findfs # with shorter timeout for speedier boot (saving ~4s) FROM alpine-sdk as alpine-mkinitfs ADD patches/mkinitfs/nlplug-findfs.patch /root/aports/main/mkinitfs/nlplug-findfs.patch RUN <<EOF cd /root/aports/main/mkinitfs echo 'source="${source} nlplug-findfs.patch"' >>APKBUILD abuild -rFf EOF # --- BUILD STAGE --- # Build dist-independent dynamic binaries and libraries FROM alpine:$ALPINE_VERSION as binaries RUN apk update && \ apk add --no-cache file bash qemu-system-x86_64 qemu-virtiofsd qemu-ui-curses qemu-guest-agent \ qemu-hw-display-virtio-vga \ ovmf \ jq iproute2 netcat-openbsd e2fsprogs blkid util-linux \ s6 dnsmasq iptables nftables \ ncurses coreutils \ patchelf # Install patched SeaBIOS COPY --from=alpine-seabios /root/packages/main/x86_64 /tmp/seabios/ RUN apk add --allow-untrusted /tmp/seabios/*.apk && cp -a /usr/share/seabios/bios*.bin /usr/share/qemu/ # Install patched dnsmasq COPY --from=alpine-dnsmasq /root/packages/main/x86_64 /tmp/dnsmasq/ RUN apk add --allow-untrusted /tmp/dnsmasq/dnsmasq-2*.apk /tmp/dnsmasq/dnsmasq-common*.apk # Install patched dropbear COPY --from=alpine-dropbear /root/packages/main/x86_64 /usr/local/lib/libepka_file.so /tmp/dropbear/ RUN apk add --allow-untrusted /tmp/dropbear/dropbear-ssh*.apk /tmp/dropbear/dropbear-dbclient*.apk /tmp/dropbear/dropbear-2*.apk # Patch the binaries and set up symlinks COPY build-utils/make-bundelf-bundle.sh /usr/local/bin/make-bundelf-bundle.sh ENV BUNDELF_BINARIES="busybox bash jq ip nc mke2fs blkid findmnt dnsmasq xtables-legacy-multi xtables-nft-multi nft mount s6-applyuidgid qemu-system-x86_64 qemu-ga /usr/lib/qemu/virtiofsd tput coreutils getent dropbear dbclient dropbearkey" ENV BUNDELF_EXTRA_LIBS="/usr/lib/xtables /usr/libexec/coreutils /tmp/dropbear/libepka_file.so /usr/lib/qemu/*.so" ENV BUNDELF_EXTRA_SYSTEM_LIB_PATHS="/usr/lib/xtables" ENV BUNDELF_CODE_PATH="/opt/runcvm" ENV BUNDELF_EXEC_PATH="/.runcvm/guest" RUN /usr/local/bin/make-bundelf-bundle.sh --bundle && \ mkdir -p $BUNDELF_CODE_PATH/bin && \ cd $BUNDELF_CODE_PATH/bin && \ for cmd in \ awk base64 cat chgrp chmod cut grep head hostname init ln ls \ mkdir poweroff ps rm rmdir route sh sysctl tr touch; \ do \ ln -s busybox $cmd; \ done && \ mkdir -p $BUNDELF_CODE_PATH/usr/share && \ cp -a /usr/share/qemu $BUNDELF_CODE_PATH/usr/share && \ cp -a /etc/terminfo $BUNDELF_CODE_PATH/usr/share && \ cp -a /usr/share/OVMF $BUNDELF_CODE_PATH/usr/share && \ # Remove setuid/setgid bits from any/all binaries chmod -R -s $BUNDELF_CODE_PATH/ # --- BUILD STAGE --- # Build static runcvm-init FROM alpine:$ALPINE_VERSION as runcvm-init RUN apk update && \ apk add --no-cache gcc musl-dev ADD runcvm-init /root/runcvm-init RUN cd /root/runcvm-init && cc -o /root/runcvm-init/runcvm-init -std=gnu99 -static -s -Wall -Werror -O3 dumb-init.c # --- BUILD STAGE --- # Build static qemu-exit FROM alpine:$ALPINE_VERSION as qemu-exit RUN apk update && \ apk add --no-cache gcc musl-dev ADD qemu-exit /root/qemu-exit RUN cd /root/qemu-exit && cc -o /root/qemu-exit/qemu-exit -std=gnu99 -static -s -Wall -Werror -O3 qemu-exit.c # --- BUILD
STAGE --- # Build alpine kernel and initramfs with virtiofs module FROM alpine:$ALPINE_VERSION as alpine-kernel # Install patched mkinitfs COPY --from=alpine-mkinitfs /root/packages/main/x86_64 /tmp/mkinitfs/ RUN apk add --allow-untrusted /tmp/mkinitfs/*.apk RUN apk add --no-cache linux-virt RUN echo 'kernel/fs/fuse/virtiofs*' >>/etc/mkinitfs/features.d/virtio.modules && \ sed -ri 's/\b(ata|nvme|raid|scsi|usb|cdrom|kms|mmc)\b//g; s/[ ]+/ /g' /etc/mkinitfs/mkinitfs.conf && \ sed -ri 's/(nlplug-findfs)/\1 --timeout=1000/' /usr/share/mkinitfs/initramfs-init && \ mkinitfs $(basename $(ls -d /lib/modules/*)) RUN BASENAME=$(basename $(ls -d /lib/modules/*)) && \ mkdir -p /opt/runcvm/kernels/alpine/$BASENAME && \ cp -a /boot/vmlinuz-virt /opt/runcvm/kernels/alpine/$BASENAME/vmlinuz && \ cp -a /boot/initramfs-virt /opt/runcvm/kernels/alpine/$BASENAME/initrd && \ cp -a /lib/modules/ /opt/runcvm/kernels/alpine/$BASENAME/ && \ cp -a /boot/config-virt /opt/runcvm/kernels/alpine/$BASENAME/modules/$BASENAME/config && \ chmod -R u+rwX,g+rX,o+rX /opt/runcvm/kernels/alpine # --- BUILD STAGE --- # Build OpenWrt kernel tree from the Alpine kernel FROM alpine-kernel as openwrt-kernel RUN mkdir -p /opt/runcvm/kernels/openwrt/$(basename $(ls -d /lib/modules/*))/modules/$(basename $(ls -d /lib/modules/*)) && \ cd /opt/runcvm/kernels/openwrt/$(basename $(ls -d /lib/modules/*)) && \ cp -a /boot/vmlinuz-virt vmlinuz && \ cp -a /boot/initramfs-virt initrd && \ find /lib/modules/ -type f -name '*.ko*' -exec cp -a {} modules/$(basename $(ls -d /lib/modules/*)) \; && \ gunzip modules/$(basename $(ls -d /lib/modules/*))/*.gz && \ chmod -R u+rwX,g+rX,o+rX /opt/runcvm/kernels/openwrt # --- BUILD STAGE --- # Build Debian bookworm kernel and initramfs with virtiofs module FROM amd64/debian:bookworm as debian-kernel ARG DEBIAN_FRONTEND=noninteractive RUN apt update && apt install -y linux-image-amd64:amd64 && \ echo 'virtiofs' >>/etc/initramfs-tools/modules && \ echo 'virtio_console' >>/etc/initramfs-tools/modules && \ echo "RESUME=none" >/etc/initramfs-tools/conf.d/resume && \ update-initramfs -u RUN BASENAME=$(basename $(ls -d /lib/modules/*)) && \ mkdir -p /opt/runcvm/kernels/debian/$BASENAME && \ cp -aL /vmlinuz /opt/runcvm/kernels/debian/$BASENAME/vmlinuz && \ cp -aL /initrd.img /opt/runcvm/kernels/debian/$BASENAME/initrd && \ cp -a /lib/modules/ /opt/runcvm/kernels/debian/$BASENAME/ && \ cp -a /boot/config-$BASENAME /opt/runcvm/kernels/debian/$BASENAME/modules/$BASENAME/config && \ chmod -R u+rwX,g+rX,o+rX /opt/runcvm/kernels/debian # --- BUILD STAGE --- # Build Ubuntu jammy kernel and initramfs with virtiofs module FROM amd64/ubuntu:jammy as ubuntu-kernel ARG DEBIAN_FRONTEND=noninteractive RUN apt update && apt install -y linux-generic:amd64 && \ echo 'virtiofs' >>/etc/initramfs-tools/modules && \ echo 'virtio_console' >>/etc/initramfs-tools/modules && \ echo "RESUME=none" >/etc/initramfs-tools/conf.d/resume && \ update-initramfs -u RUN BASENAME=$(basename $(ls -d /lib/modules/*)) && \ mkdir -p /opt/runcvm/kernels/ubuntu/$BASENAME && \ cp -aL /boot/vmlinuz /opt/runcvm/kernels/ubuntu/$BASENAME/vmlinuz && \ cp -aL /boot/initrd.img /opt/runcvm/kernels/ubuntu/$BASENAME/initrd && \ cp -a /lib/modules/ /opt/runcvm/kernels/ubuntu/$BASENAME/ && \ cp -a /boot/config-$BASENAME /opt/runcvm/kernels/ubuntu/$BASENAME/modules/$BASENAME/config && \ chmod -R u+rwX,g+rX,o+rX /opt/runcvm/kernels/ubuntu # --- BUILD STAGE --- # Build Oracle Linux kernel and initramfs with virtiofs module FROM oraclelinux:9 as oracle-kernel RUN dnf install -y kernel ADD ./kernels/oraclelinux/addvirtiofs.conf
/etc/dracut.conf.d/addvirtiofs.conf ADD ./kernels/oraclelinux/95virtiofs /usr/lib/dracut/modules.d/95virtiofs RUN dracut --force --kver $(basename /lib/modules/*) --kmoddir /lib/modules/* RUN mkdir -p /opt/runcvm/kernels/ol/$(basename $(ls -d /lib/modules/*)) && \ mv /lib/modules/*/vmlinuz /opt/runcvm/kernels/ol/$(basename $(ls -d /lib/modules/*))/vmlinuz && \ cp -aL /boot/initramfs* /opt/runcvm/kernels/ol/$(basename $(ls -d /lib/modules/*))/initrd && \ cp -a /lib/modules/ /opt/runcvm/kernels/ol/$(basename $(ls -d /lib/modules/*))/ && \ chmod -R u+rwX,g+rX,o+rX /opt/runcvm/kernels/ol # --- BUILD STAGE --- # Build RunCVM installer FROM alpine:$ALPINE_VERSION as installer COPY --from=binaries /opt/runcvm /opt/runcvm COPY --from=runcvm-init /root/runcvm-init/runcvm-init /opt/runcvm/sbin/ COPY --from=qemu-exit /root/qemu-exit/qemu-exit /opt/runcvm/sbin/ RUN apk update && apk add --no-cache rsync ADD runcvm-scripts /opt/runcvm/scripts/ ADD build-utils/entrypoint-install.sh / ENTRYPOINT ["/entrypoint-install.sh"] # Install needed kernels. # Comment out any kernels that are unneeded. COPY --from=alpine-kernel /opt/runcvm/kernels/alpine /opt/runcvm/kernels/alpine COPY --from=debian-kernel /opt/runcvm/kernels/debian /opt/runcvm/kernels/debian COPY --from=openwrt-kernel /opt/runcvm/kernels/openwrt /opt/runcvm/kernels/openwrt COPY --from=ubuntu-kernel /opt/runcvm/kernels/ubuntu /opt/runcvm/kernels/ubuntu COPY --from=oracle-kernel /opt/runcvm/kernels/ol /opt/runcvm/kernels/ol # Add 'latest' symlinks for available kernels RUN for d in /opt/runcvm/kernels/*; do \ cd "$d" && \ tgt="$(ls -d */ 2>/dev/null | sed 's:/$::' | grep -v '^latest$' | sort -Vr | head -n 1)"; \ [ -n "$tgt" ] && ln -sfn "$tgt" latest; \ done ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). 
"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # RunCVM Container Runtime ## Introduction RunCVM (Run Container Virtual Machine) is an experimental open-source Docker container runtime for Linux, created by Struan Bartlett at NewsNow Labs, that makes launching standard containerised workloads and system workloads (e.g. Systemd, Docker, even OpenWrt) in VMs as easy as launching a container.

Demo (Asciinema recording): Install RunCVM and then launch an Alpine Container/VM.

## Quick start Install: ```sh curl -s -o - https://raw.githubusercontent.com/newsnowlabs/runcvm/main/runcvm-scripts/runcvm-install-runtime.sh | sudo sh ``` Now launch an nginx VM listening on port 8080: ```console docker run --runtime=runcvm --name nginx1 --rm -p 8080:80 nginx ``` Launch a MariaDB VM, with 2 CPUs and 2G memory, listening on port 3306: ```console docker run --runtime=runcvm --name mariadb1 --rm -p 3306:3306 --cpus 2 --memory 2G --env=MARIADB_ALLOW_EMPTY_ROOT_PASSWORD=1 mariadb ``` Launch a vanilla Ubuntu VM, with interactive terminal: ```console docker run --runtime=runcvm --name ubuntu1 --rm -it ubuntu ``` Gain another interactive console on `ubuntu1`: ```console docker exec -it ubuntu1 bash ``` Launch with VNC: ```console docker run --runtime=runcvm --name ubuntu2 --env=RUNCVM_DISPLAY_MODE=vnc -p 5900:5900 ubuntu ``` Launch a VM with 1G memory and a 1G ext4-formatted backing file mounted at `/var/lib/docker` and stored in the underlying container's filesystem: ```console docker run -it --runtime=runcvm --memory=1G --env=RUNCVM_DISKS=/disks/docker,/var/lib/docker,ext4,1G <image> ``` Launch a VM with 2G memory and a 5G ext4-formatted backing file mounted at `/var/lib/docker` and stored in a dedicated Docker volume on the host: ```console docker run -it --runtime=runcvm --memory=2G --mount=type=volume,src=runcvm-disks,dst=/disks --env=RUNCVM_DISKS=/disks/docker,/var/lib/docker,ext4,5G <image> ``` Launch a 3-node Docker Swarm on a network with 9000 MTU and, on the swarm, an http global service: ```console git clone https://github.com/newsnowlabs/runcvm.git && \ cd runcvm/tests/00-http-docker-swarm && \ NODES=3 MTU=9000 ./test ``` ### Graphical workloads with VNC Launch an Ubuntu VM with a VNC server attached to its virtual VGA device, to which a VNC client can connect on host port 5900: ```console docker run --runtime=runcvm --name ubuntu-vnc --env=RUNCVM_DISPLAY_MODE=vnc -p 5900:5900 -it ubuntu ``` ### Nested runtime demos **Nested Sysbox demo** - Launch Ubuntu running Systemd and Docker with the [Sysbox](https://github.com/nestybox/sysbox) runtime installed (having first built an `ubuntu-docker-sysbox` image containing Systemd, Docker and Sysbox); then within it run an Alpine _Sysbox_ container; and, within that install dockerd and, _within that_, run a container from the 'hello-world' image: ```console docker run -d --runtime=sysbox-runc -m 2g --name=ubuntu-docker-sysbox ubuntu-docker-sysbox docker exec ubuntu-docker-sysbox bash -c "docker run --rm --runtime=sysbox-runc alpine ash -x -c 'apk add docker; dockerd &>/dev/null & sleep 5; docker run --rm hello-world'" docker rm -fv ubuntu-docker-sysbox ``` - [Watch on Asciinema](https://asciinema.org/a/630032) **Nested RunCVM demo** - Launch Ubuntu running Systemd and Docker with RunCVM runtime installed; then within it run an Alpine _RunCVM_ Container/VM; and, within that install dockerd and, _within that_, run a container from the 'hello-world' image: ```console cat <<EOF | docker build -t ubuntu-docker-runcvm - FROM ubuntu ARG DEBIAN_FRONTEND=noninteractive RUN apt update && apt install -y systemd curl RUN curl -fsSL https://get.docker.com | bash RUN curl -s -o - https://raw.githubusercontent.com/newsnowlabs/runcvm/main/runcvm-scripts/runcvm-install-runtime.sh | sh RUN echo 'kvm_intel' >>/etc/modules ENTRYPOINT ["/lib/systemd/systemd"] ENV RUNCVM_DISKS='/disks/docker,/var/lib/docker,ext4,1G' VOLUME /disks EOF docker run -d --runtime=runcvm -m 2g --name=ubuntu-docker-runcvm ubuntu-docker-runcvm docker exec ubuntu-docker-runcvm bash -c "docker run --rm --runtime=runcvm alpine ash -x -c 'apk add docker; dockerd &>/dev/null & sleep 5; docker run --rm hello-world'" docker rm -fv ubuntu-docker-runcvm ``` **Docker+GVisor runtime demo** - Launch Ubuntu running Systemd and Docker with GVisor runtime; then within it run the 'hello-world' image in a _GVisor_ container: ```console cat <<EOF | docker build -t ubuntu-docker-gvisor - FROM ubuntu ARG DEBIAN_FRONTEND=noninteractive RUN apt update && apt install -y systemd curl gnupg jq RUN curl -fsSL https://get.docker.com | bash RUN curl -fsSL https://gvisor.dev/archive.key | gpg --dearmor -o /usr/share/keyrings/gvisor-archive-keyring.gpg && \ echo "deb [signed-by=/usr/share/keyrings/gvisor-archive-keyring.gpg] https://storage.googleapis.com/gvisor/releases release main" >/etc/apt/sources.list.d/gvisor.list && \ apt update && \ apt-get install -y runsc RUN [ !
-f /etc/docker/daemon.json ] && echo '{}' > /etc/docker/daemon.json; cat /etc/docker/daemon.json | jq '.runtimes.runsc.path="/usr/bin/runsc"' | tee /etc/docker/daemon.json ENTRYPOINT ["/lib/systemd/systemd"] ENV RUNCVM_DISKS='/disks/docker,/var/lib/docker,ext4,1G' VOLUME /disks EOF docker run -d --runtime=runsc -m 2g --name=ubuntu-docker-gvisor ubuntu-docker-gvisor docker exec ubuntu-docker-gvisor bash -c "docker run --rm --runtime=runsc hello-world" docker rm -fv ubuntu-docker-gvisor ``` **Launch [OpenWrt](https://openwrt.org/)** - with port forward to LuCI web UI on port 10080: ```console docker import --change='ENTRYPOINT ["/sbin/init"]' https://archive.openwrt.org/releases/23.05.2/targets/x86/generic/openwrt-23.05.2-x86-generic-rootfs.tar.gz openwrt-23.05.2 && \ docker network create --subnet 172.128.0.0/24 runcvm-openwrt && \ echo -e "config interface 'loopback'\n\toption device 'lo'\n\toption proto 'static'\n\toption ipaddr '127.0.0.1'\n\toption netmask '255.0.0.0'\n\nconfig device\n\toption name 'br-lan'\n\toption type 'bridge'\n\tlist ports 'eth0'\n\nconfig interface 'lan'\n\toption device 'br-lan'\n\toption proto 'static'\n\toption ipaddr '172.128.0.5'\n\toption netmask '255.255.255.0'\n\toption gateway '172.128.0.1'\n" >/tmp/runcvm-openwrt-network && \ docker run -it --rm --runtime=runcvm --name=openwrt --network=runcvm-openwrt --ip=172.128.0.5 -v /tmp/runcvm-openwrt-network:/etc/config/network -p 10080:80 openwrt-23.05.2 ``` - [Watch on Asciinema](https://asciinema.org/a/631857) ## RunCVM-in-Portainer walk-through [![Playing around with RunCVM, a docker runtime plugin](https://i.ytimg.com/vi/OENaWDlCWKg/maxresdefault.jpg)](https://www.youtube.com/watch?v=OENaWDlCWKg "Playing around with RunCVM, a docker runtime plugin") ## Motivation RunCVM was born out of difficulties experienced using the Docker and Podman CLIs to launch [Kata Containers v2](https://katacontainers.io/), and a belief that launching containerised workloads in VMs using Docker needn't be so complicated. > Motivations included: efforts to [re-add OCI CLI commands for docker/podman](https://github.com/kata-containers/kata-containers/issues/722) to Kata v2 to support Docker & Podman; other Kata issues [#3358](https://github.com/kata-containers/kata-containers/issues/3358), [#1123](https://github.com/kata-containers/kata-containers/issues/1123), [#1133](https://github.com/kata-containers/kata-containers/issues/1133), [#3038](https://github.com/kata-containers/runtime/issues/3038); [#5321](https://github.com/kata-containers/runtime/issues/5321); [#6861](https://github.com/kata-containers/runtime/issues/6861); Podman issues [#8579](https://github.com/containers/podman/issues/8579) and [#17070](https://github.com/containers/podman/issues/17070); and Kubernetes issue [#40114](https://github.com/kubernetes/website/issues/40114); though please note, since authoring RunCVM some of these issues may have been resolved. Like Kata, RunCVM aims to be a secure container runtime with lightweight virtual machines that feel and perform like containers, but provide stronger workload isolation using hardware virtualisation technology. 
However, while Kata aims to launch standard container images inside a restricted-privileges namespace inside a VM running a single fixed and heavily customised kernel and Linux distribution optimised for this purpose, RunCVM intentionally aims to launch container _or VM_ images as the _VM's root filesystem_ using stock or bespoke Linux kernels, the upshot being that RunCVM can run VM workloads that Kata's security and kernel model would explicitly prevent. For example: - RunCVM can launch system images expecting to interface directly with hardware, like [OpenWrt](https://openwrt.org/) - RunCVM can launch VMs nested inside a RunCVM VM - i.e. an 'inner' RunCVM Container/VM guest can be launched by Docker running within an 'outer' RunCVM Container/VM guest (assuming the host supports nested VMs) - in this sense, RunCVM is 'reentrant'. RunCVM features: - Compatible with `docker run` (with experimental support for `podman run`). - Uses a lightweight 'wrapper-runtime' technology that subverts the behaviour of the standard container runtime `runc` to cause a VM to be launched within the container (making its code footprint and external dependencies extremely small, and its internals extremely simple and easy to understand and tailor for specific purposes). - Highly portable among Linux distributions and development platforms providing KVM. Can even be installed on [GitHub Codespaces](https://github.com/features/codespaces)! - Written, using off-the-shelf open-source components, almost entirely in shell script for simplicity, portability and ease of development. > RunCVM makes some trade-offs in return for this simplicity. See the full list of [features and limitations](#features-and-limitations). ## Contents - [Introduction](#introduction) - [Quick start](#quick-start) - [Motivation](#motivation) - [Licence](#licence) - [Project aims](#project-aims) - [Project ambitions](#project-ambitions) - [Applications for RunCVM](#applications-for-runcvm) - [How RunCVM works](#how-runcvm-works) - [System requirements](#system-requirements) - [Installation](#installation) - [Upgrading](#upgrading) - [Features and Limitations](#features-and-limitations) - [RunCVM vs Kata comparison](#runcvm-vs-kata-comparison) - [Kernel auto-detection](#kernel-auto-detection) - [Option reference](#option-reference) - [Advanced usage](#advanced-usage) - [Developing](#developing) - [Building](#building) - [Testing](#testing) - [Contributing](#contributing) - [Support](#support) - [Uninstallation](#uninstallation) - [Legals](#legals) ## Licence RunCVM is free and open-source, licensed under the Apache Licence, Version 2.0. See the [LICENSE](LICENSE) file for details.
## Project aims - Run any standard container workload in a VM using `docker run` with no need to customise images or the command line (except adding `--runtime=runcvm`) - Run unusual container workloads, like `dockerd` and `systemd`, that will not run in standard container runtimes - Maintain a similar experience within a RunCVM VM as within a container: process table, network interfaces, stdio and exit code handling should be broadly similar to maximise compatibility - Container start/stop/kill semantics respected, where possible providing clean VM shutdown on stop - VM console accessible as one would expect using `docker run -it`, `docker start -ai` and `docker attach` (and so on), with generally good support for other `docker container` subcommands - Efficient container startup, by using virtiofs to serve a container's filesystem directly to a VM (instead of unpacking an image into a backing file) - Improved security compared to the standard container runtime, and as much security as possible without compromising the simplicity of the implementation - Command-line and image-embedded options for customising a container's VM specification, devices and kernel - Intelligent kernel selection, according to the distribution used in the image being launched - No external dependencies, except for Docker/Podman and relevant Linux kernel modules (`kvm` and `tun`) - Support multiple Docker network interfaces attached to a created (but not yet running) container using `docker run --network=<network>` and `docker network connect` (excluding IPv6) ## Project ambitions - Support for booting a VM with a file-backed disk root fs generated from the container image, instead of only virtiofs root - Support running foreign-architecture VMs by using QEMU dynamic CPU emulation for the entire VM (instead of the approach used by [https://github.com/multiarch/qemu-user-static](https://github.com/multiarch/qemu-user-static) which uses dynamic CPU emulation for each individual binary) - Support for QEMU [microvm](https://qemu.readthedocs.io/en/latest/system/i386/microvm.html) or potentially Amazon Firecracker - More natural console support with independent stdout and stderr channels for `docker run -it` - Improve VM boot time and other behaviours using a custom kernel - Support for specific hardware e.g. graphics display served via VNC ## Applications for RunCVM The main applications for RunCVM are: 1. Running and testing applications that: - don't work with (or require enhanced privileges to work with) standard container runtimes (e.g. `systemd`, `dockerd`, Docker swarm services, [Kubernetes](https://kubernetes.io/)) - require a running kernel, or a kernel version or modules not available on the host - require specific hardware that can be emulated e.g. disks, graphics displays 2. Running existing container workloads with increased security 3. Testing container workloads that are already intended to launch in VM environments, such as on [fly.io](https://fly.io) 4. Developing any of the above applications in [Dockside](https://dockside.io/) (see [RunCVM and Dockside](#runcvm-and-dockside)) ## How RunCVM works RunCVM's 'wrapper' runtime, `runcvm-runtime`, receives container create commands triggered by `docker` `run`/`create` commands, modifies the configuration of the requested container in such a way that the created container will launch a VM that boots from the container's filesystem, and then passes the request on to the standard container runtime (`runc`) to actually create and start the container.
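Docker routes `--runtime=runcvm` requests to this wrapper runtime via a `runtimes` entry that the installer adds to `/etc/docker/daemon.json`. Assuming the default install location under `/opt/runcvm`, that entry should resemble the following sketch:

```console
$ cat /etc/docker/daemon.json
{
  "runtimes": {
    "runcvm": {
      "path": "/opt/runcvm/scripts/runcvm-runtime"
    }
  }
}
```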
For a deep dive into RunCVM's internals, see the section on [Developing RunCVM](#developing). ## System requirements RunCVM should run on any amd64 (x86_64) hardware (or VM) running Linux kernel >= 5.10 that supports [KVM](https://www.linux-kvm.org/page/Main_Page) and [Docker](https://docker.com). So if your host can already run [KVM](https://www.linux-kvm.org/page/Main_Page) VMs and [Docker](https://docker.com) then it should run RunCVM. RunCVM has no other host dependencies, apart from Docker (or experimentally, Podman) and the `kvm` and `tun` kernel modules. RunCVM comes packaged with all binaries and libraries it needs to run (including its own QEMU binary). RunCVM is tested on Debian Bullseye and [GitHub Codespaces](https://github.com/codespaces/new?hide_repo_select=true&ref=main&repo=514606231). ### rp_filter sysctl settings For RunCVM to support Docker DNS within Container/VMs, the following condition on `/proc/sys/net/ipv4/conf/` must be met: - the max of `all/rp_filter` and `<bridge>/rp_filter` should be 0 ('No Source Validation') or 2 (Loose mode as defined in RFC3704 Loose Reverse Path) (where `<bridge>` is any bridge underpinning a Docker network to which RunCVM Container/VMs will be attached) This means that: - if `all/rp_filter` will be set to 0, then `<bridge>/rp_filter` must be set to 0 or 2 (or, if `<bridge>` might not yet have been created, then `default/rp_filter` must be set to 0 or 2) - if `all/rp_filter` will be set to 1, then `<bridge>/rp_filter` must be set to 2 (or, if `<bridge>` might not yet have been created, then `default/rp_filter` must be set to 2) - if `all/rp_filter` will be set to 2, then no further action is needed At time of writing: - the Debian default is `0`; - the Ubuntu default is `2`; - the Google Cloud Debian image has default `1`, and `rp_filter` settings in `/etc/sysctl.d/60-gce-network-security.conf` must be modified or overridden to support RunCVM. We recommend `all/rp_filter` be set to 2, as this is the simplest change and provides a good balance of security. ## Installation Run: ```sh curl -s -o - https://raw.githubusercontent.com/newsnowlabs/runcvm/main/runcvm-scripts/runcvm-install-runtime.sh | sudo sh ``` This will: - Install the RunCVM software package to `/opt/runcvm` (installation elsewhere is currently unsupported) - For Docker support: - Enable the RunCVM runtime, by patching `/etc/docker/daemon.json` to add `runcvm` to the `runtimes` property - Restart `dockerd`, if the installer can determine how to do so on your system (e.g.
`systemctl restart docker`) - Verify that RunCVM is recognised via `docker info` - For Podman support (experimental): - Display instructions on patching `/etc/containers/containers.conf` - Check your system network device `rp_filter` settings, and amend them if necessary Following installation, launch a basic test RunCVM Container/VM: ```console docker run --runtime=runcvm --rm -it hello-world ``` ### Install on Google Cloud Create an image that will allow instances to have VMX capability: ```console gcloud compute images create debian-12-vmx --source-image-project=debian-cloud --source-image-family=debian-12 --licenses="https://compute.googleapis.com/compute/v1/projects/vm-options/global/licenses/enable-vmx" ``` Now launch a VM, install Docker and RunCVM: ```console cat >/tmp/startup-script.sh <<EOF #!/bin/sh mkdir -p /etc/docker && echo '{}' >/etc/docker/daemon.json && \ curl -fsSL https://get.docker.com | bash && \ curl -s -o - https://raw.githubusercontent.com/newsnowlabs/runcvm/main/runcvm-scripts/runcvm-install-runtime.sh | sudo REPO=newsnowlabs/runcvm:latest sh EOF gcloud compute instances create runcvm-vmx-test --zone=us-central1-a --machine-type=n2-highmem-2 --network-interface=network-tier=PREMIUM,stack-type=IPV4_ONLY,subnet=default --metadata-from-file=startup-script=/tmp/startup-script.sh --no-restart-on-failure --maintenance-policy=TERMINATE --provisioning-model=SPOT --instance-termination-action=STOP --no-service-account --no-scopes --create-disk=auto-delete=yes,boot=yes,image=debian-12-vmx,mode=rw,size=50,type=pd-ssd --no-shielded-secure-boot --shielded-vtpm --shielded-integrity-monitoring --labels=goog-ec-src=vm_add-gcloud --reservation-affinity=any ``` ## Upgrading To upgrade, follow this procedure: 1. Stop all RunCVM containers. 2. Run `/opt/runcvm/scripts/runcvm-install-runtime.sh` (or rerun the installation command, which runs the same script) 3. Start your RunCVM containers again. ## Features and limitations In the below summary of RunCVM's current main features and limitations, [+] is used to indicate an area of compatibility with standard container runtimes and [-] is used to indicate a feature of standard container runtimes that is unsupported. > N.B. `docker run` and `docker exec` options not listed below are unsupported and their effect, if used, is unspecified. - `docker run` - Mounts - [+] `--mount` (or `-v`) is supported for volume mounts, tmpfs mounts, and host file and directory bind-mounts (the `dst` mount path `/disks` is reserved) - [-] Bind-mounting host sockets or devices, and `--device`, is unsupported - Networking - [+] The default bridge network is supported - [+] Custom/user-defined networks specified using `--network` are supported, including Docker DNS resolution of container names and respect for custom network MTU - [+] Multiple network interfaces - when attached via `docker run --network` or `docker network connect` (but only to a created and not yet running container) - are supported (including `scope=overlay` networks and those with multiple subnets) - [+] `--publish` (or `-p`) is supported - [+] `--dns`, `--dns-option`, `--dns-search` are supported - [+] `--ip` is supported - [+] `--hostname` (or `-h`) is supported - [-] `docker network connect` on a running container is not supported - [-] `--network=host` and `--network=container:name|id` are not supported - [-] IPv6 is not supported - Execution environment - [+] `--user` (or `-u`) is supported - [+]
`--workdir` (or `-w`) is supported - [+] `--env` (or `-e`) and `--env-file` are supported - [+] `--entrypoint` is supported - [+] `--init` - is supported (but runs RunCVM's own VM init process rather than Docker's default, `tini`) - stdio/Terminals - [+] `--detach` (or `-d`) is supported - [+] `--interactive` (or `-i`) is supported - [+] `--tty` (or `-t`) is supported (but to enter CTRL-T one must press CTRL-T twice) - [+] `--attach` (or `-a`) is supported - [+] Stdout and Stderr output should be broadly similar to running the same workload in a standard `runc` container - [-] Stdout and Stderr are not independently multiplexed so `docker run --runtime=runcvm debian bash -c 'echo stdout; echo stderr >&2' >/tmp/stdout 2>/tmp/stderr` does not produce the expected result - [-] Stdout and Stderr sent very soon after VM launch might be corrupted due to serial console issues - [-] Stdout and Stderr sent immediately before VM shutdown might not always be fully flushed - Resource allocation and limits - [+] `--cpus` is supported to specify number of VM CPUs - [+] `--memory` (or `-m`) is supported to specify VM memory (and limit container memory to this value plus a 256MB contingency) - [-] Other container resource limit options such as CPU (`--cpu-*`), block IO (`--blkio-*`) and kernel memory (`--kernel-memory`) are unsupported or untested - Exit code - [+] Returning an exit code is supported, but it currently requires application support: the application must either write the exit code to `/.runcvm/exitcode` (supported exit codes 0-255) or call `/opt/runcvm/sbin/qemu-exit <code>` (supported exit codes 0-127). Automatic handling of exit codes from the entrypoint may be provided in a later release. - Disk performance - [+] No mountpoints are required for basic operation for most applications. Volume or disk mountpoints may be needed for running `dockerd` or to improve disk performance - [-] `dockerd` mileage will vary unless a volume or disk is mounted over `/var/lib/docker` - `docker exec` - [+] `--user` (or `-u`), `--workdir` (or `-w`), `--env` (or `-e`), `--env-file`, `--detach` (or `-d`), `--interactive` (or `-i`) and `--tty` (or `-t`) are all supported - [+] Stdout and Stderr _are_ independently multiplexed so `docker exec <container> bash -c 'echo stdout; echo stderr >&2' >/tmp/stdout 2>/tmp/stderr` _does_ produce the expected result - Security - The RunCVM software package at `/opt/runcvm` is mounted read-only within RunCVM containers. Container applications cannot compromise RunCVM, but they can execute binaries from within the RunCVM package. The set of binaries available to the VM may be reduced to a minimum in a later version. - Kernels - [+] Use any kernel, either one pre-packaged with RunCVM or roll your own - [+] RunCVM will try to select an appropriate kernel to use based on examination of `/etc/os-release` within the image being launched. - Firmware - [+] [SeaBIOS](https://github.com/qemu/seabios) - [+] [OVMF EFI](https://github.com/tianocore/tianocore.github.io/wiki/OVMF) ## RunCVM vs Kata comparison This table provides a high-level comparison of RunCVM and Kata across various features like kernels, networking/DNS, memory allocation, namespace handling, method of operation, and performance characteristics: | Feature | RunCVM | Kata | |---------|--------|------| | **Methodology** | Boots VM from distribution kernels with container's filesystem directly mounted as root filesystem, using virtiofs. VM setup code and kernel modules are bind-mounted into the container.
VM's PID1 runs setup code to reproduce the container's networking environment within the VM before executing the container's original entrypoint. | Boots VM from custom kernel with custom root disk image, mounts the virtiofsd-shared host container filesystem to a target folder and executes the container's entrypoint within a restricted namespace having chrooted to that folder. | | **Privileges/restrictions** | Container code has full root access to VM and its devices. It may run anything that runs in a VM, mounting filesystems, installing kernel modules, accessing devices. RunCVM helper processes are visible to `ps` etc. | Runs container code inside a VM namespace with restricted privileges. Use of mounts, kernel modules is restricted. Kata helper processes (like kata-agent and chronyd) are invisible to `ps`.| | **Kernels** | Launches stock Alpine, Debian, Ubuntu kernels. Kernel `/lib/modules` automatically mounted within VM. Install any needed modules without host reconfiguration. | Launches custom kernels. Kernel modules aren't mounted and need host reconfiguration to be installed. | | **Networking/DNS** | Docker container networking + internal/external DNS out-of-the-box. No support for `docker network connect/disconnect` | DNS issues presented: with custom network, external ping works, but DNS lookups fail both for internal docker hosts and external hosts.[^1] | | **Memory** | VM assigned and reports total memory as per `--memory <size>` | VM total memory reported by `free` appears unrelated to `--memory <size>` specified [^2] | | **CPUs** | VM assigned and reports CPUs as per `--cpus <n>` | CPUs must be hardcoded in Kata host config | | **Performance** | | Custom kernel optimisations may deliver improved startup (~3.2s) or operational performance (~15%) | | **virtiofsd** | Runs `virtiofsd` in container namespace | Unknown | [^1]: `docker network create --scope=local testnet >/dev/null && docker run --name=test --rm --runtime=kata --network=testnet --entrypoint=/bin/ash alpine -c 'for n in test google.com 8.8.8.8; do echo "ping $n ..."; ping -q -c 8 -i 0.5 $n; done'; docker network rm testnet >/dev/null` succeeds on `runc` and `runcvm`, but at time of writing (2023-12-31) the needed DNS lookups fail on `kata`. ``` $ docker network create --scope=local testnet >/dev/null && docker run --name=test --rm -it --runtime=kata --network=testnet --entrypoint=/bin/ash alpine -c 'for n in test google.com 8.8.8.8; do echo "ping $n ..."; ping -q -c 8 -i 0.5 $n; done'; docker network rm testnet >/dev/null ping test ... ping: bad address 'test' ping google.com ... ping: bad address 'google.com' ping 8.8.8.8 ... PING 8.8.8.8 (8.8.8.8): 56 data bytes --- 8.8.8.8 ping statistics --- 8 packets transmitted, 8 packets received, 0% packet loss round-trip min/avg/max = 0.911/1.716/3.123 ms $ docker network create --scope=local testnet >/dev/null && docker run --name=test --rm -it --runtime=runcvm --network=testnet --entrypoint=/bin/ash alpine -c 'for n in test google.com 8.8.8.8; do echo "ping $n ..."; ping -q -c 8 -i 0.5 $n; done'; docker network rm testnet >/dev/null ping test ... PING test (172.25.8.2): 56 data bytes --- test ping statistics --- 8 packets transmitted, 8 packets received, 0% packet loss round-trip min/avg/max = 0.033/0.085/0.137 ms ping google.com ... PING google.com (172.217.16.238): 56 data bytes --- google.com ping statistics --- 8 packets transmitted, 8 packets received, 0% packet loss round-trip min/avg/max = 8.221/8.398/9.017 ms ping 8.8.8.8 ...
PING 8.8.8.8 (8.8.8.8): 56 data bytes --- 8.8.8.8 ping statistics --- 8 packets transmitted, 8 packets received, 0% packet loss round-trip min/avg/max = 1.074/1.491/1.801 ms ``` [^2]: `docker run --rm -it --runtime=kata --entrypoint=/bin/ash -m 500m alpine -c 'free -h; df -h /dev/shm'` ``` $ docker run --rm --runtime=kata --name=test -m 2g --env=RUNCVM_KERNEL_DEBUG=1 -it alpine ash -c 'free -h' total used free shared buff/cache available Mem: 3.9G 94.4M 3.8G 0 3.7M 3.8G Swap: 0 0 0 $ docker run --rm --runtime=kata --name=test -m 3g --env=RUNCVM_KERNEL_DEBUG=1 -it alpine ash -c 'free -h' total used free shared buff/cache available Mem: 4.9G 107.0M 4.8G 0 3.9M 4.8G Swap: 0 0 0 $ docker run --rm --runtime=kata --name=test -m 0g --env=RUNCVM_KERNEL_DEBUG=1 -it alpine ash -c 'free -h' total used free shared buff/cache available Mem: 1.9G 58.8M 1.9G 0 3.4M 1.9G Swap: 0 0 0 ``` ## Kernel auto-detection When creating a container, RunCVM will examine the image being launched to try to determine a suitable kernel to boot the VM with. Its process is as follows: 1. If `--env=RUNCVM_KERNEL=<dist>[/<version>]` is specified, use the indicated kernel 2. Otherwise, identify the distro from `/etc/os-release` 1. If one is found in the appropriate distro-specific location in the image, select an in-image kernel. The locations are: - Debian: `/vmlinuz` and `/initrd.img` - Ubuntu: `/boot/vmlinuz` and `/boot/initrd.img` - Alpine: `/boot/vmlinuz-virt` and `/boot/initramfs-virt` 2. Otherwise, if found in the RunCVM package, select the latest kernel compatible with the distro 3. Finally, use the Debian kernel from the RunCVM package ## Option reference RunCVM options are specified either via standard `docker run` options or via `--env=<name>=<value>` options on the `docker run` command line. The following env options are user-configurable: ### `--env=RUNCVM_KERNEL=<dist>[/<version>]` Specify with which RunCVM kernel (from `/opt/runcvm/kernels`) to boot the VM. Values must be of the form `<dist>/<version>`, where `<dist>` is a directory under `/opt/runcvm/kernels` and `<version>` is a subdirectory (or symlink to a subdirectory) under that. If `<version>` is omitted, `latest` will be assumed. Here is an example command that will list available values of `<dist>/<version>` on your installation. ```console $ find /opt/runcvm/kernels/ -maxdepth 2 | sed 's!^/opt/runcvm/kernels/!!; /^$/d' debian debian/latest debian/5.10.0-16-amd64 alpine alpine/latest alpine/5.15.59-0-virt ubuntu ubuntu/latest ubuntu/5.15.0-43-generic ol ol/5.14.0-70.22.1.0.1.el9_0.x86_64 ol/latest ``` Example: ```console docker run --rm --runtime=runcvm --env=RUNCVM_KERNEL=ol hello-world ``` ### `--env=RUNCVM_KERNEL_APPEND='<options>'` Any custom kernel command line options, e.g. `apparmor=0` or `systemd.unified_cgroup_hierarchy=0`. ### `--env='RUNCVM_DISKS=<disk1>[;<disk2>;...]'` Automatically create, format, prepopulate and mount backing files as virtual disks on the VM. Each `<disk>` should be a comma-separated list of values of the form: `<src>,<dst>,<fs>[,<size>]`. - `<src>` is the path _within the container_ where the virtual disk backing file should be located. This may be in the container's overlayfs or within a volume (mounted using `--mount=type=volume`). - `<dst>` is both (a) the path within the VM where the virtual disk should be mounted; and (b) the directory with whose contents the disk should be prepopulated. - `<fs>` is the filesystem with which the backing disk should be formatted when first created. - `<size>` is the size of the backing file (in `truncate` format), and must be specified if `<src>` does not exist.
When first created, the backing file will be created as a sparse file of the specified `<size>` and formatted with the specified `<fs>` using `mke2fs`, then prepopulated with any files preexisting at `<dst>`. When RunCVM creates a Container/VM, fstab entries will be drafted. After the VM boots, the fstab entries will be mounted. Typically, the first disk will be mounted as `/dev/vda`, the second as `/dev/vdb`, and so on. #### Example #1 ```console docker run -it --runtime=runcvm --env=RUNCVM_DISKS=/disk1,/home,ext4,5G <image> ``` In this example, RunCVM will check for existence of a file at `/disk1` within the container and, if not found, create a 5G backing file (in the container's filesystem, typically overlay2) with an ext4 filesystem prepopulated with any preexisting contents of `/home`, then add the disk to `/etc/fstab` and mount it within the VM at `/home`. #### Example #2 ```console docker run -it --runtime=runcvm --mount=type=volume,src=runcvm-disks,dst=/disks --env='RUNCVM_DISKS=/disks/disk1,/home,ext4,5G;/disks/disk2,/opt,ext4,2G' <image> ``` This example behaves similarly, except that the `runcvm-disks` persistent Docker volume is first mounted at `/disks` within the container's filesystem, and therefore the backing files at `/disks/disk1` and `/disks/disk2` (mounted in the VM at `/home` and `/opt` respectively) are stored in the _persistent volume_ (typically stored in `/var/lib/docker` on the host, bypassing overlay2). > N.B. `/disks` and any paths below it are _reserved mountpoints_. Unlike other mountpoints, these are *NOT* mounted into the VM but only into the container, and are therefore suitable for mounting VM disks from backing files that need not be accessible within the VM's filesystem. ### `--env=RUNCVM_DISPLAY_MODE=<mode>` Shortcut to select a display mode: - `headless` is traditional headless runc-like behaviour over hvc0; - `serial` is similar, over `ttyS0`; - `vnc` launches a VNC server over a VGA device (`virtio`, the default; or `std`) on VNC display `RUNCVM_QEMU_VNC_DISPLAY` (default display is `0` aka `:0`) with virtual tablet and audio device. > N.B. Use `--env=RUNCVM_DISPLAY_MODE=vnc` always with `-p <hostport>:<containerport>` where `<containerport> = <display> + 5900` and `<display>` is 0 (unless, optionally, `--env=RUNCVM_QEMU_VNC_DISPLAY=<display>` is used to specify a different `<display>` >= 0) to initiate a VNC server to which a VNC client can connect on host port `<hostport>`. > > e.g. `docker run --runtime=runcvm --env=RUNCVM_DISPLAY_MODE=vnc -p 5900:5900 ubuntu` ### `--env=RUNCVM_QEMU_VGA=<vga>` Selects a QEMU guest VGA adaptor: `none`, `virtio`, `std`, `cirrus`. Can be used with `--env=RUNCVM_DISPLAY_MODE=vnc` to override the default `virtio` VGA device. ### `--env=RUNCVM_QEMU_VNC_DISPLAY=<display>` With `--env=RUNCVM_DISPLAY_MODE=vnc`, overrides the VNC display number (and hence `<containerport>`). With `--env=RUNCVM_DISPLAY_MODE=headless`, specifies that a VNC server should also be launched with the given display number. ### `--env=RUNCVM_QEMU_DISPLAY=<display>` [experimental] Specify a QEMU frontend display. Normally RunCVM runs headless, without any frontend display, so the default is `none`. Currently only `curses` is supported. ### `--env=RUNCVM_QEMU_USB=1` Enable USB interface. ### `--env=RUNCVM_BIOS=EFI` By default SeaBIOS is used to boot the VM. Enable OVMF EFI boot with this option. ### `--env=RUNCVM_QEMU_NET_VHOST=1` Enable use of [virtio vhost-net](https://www.redhat.com/en/blog/introduction-virtio-networking-and-vhost-net) (reliant on host `vhost_net` module and `/dev/vhost-net` device) to accelerate networking.
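For example, to relaunch the nginx Container/VM from the [Quick start](#quick-start) with accelerated networking (assuming the host's `vhost_net` module is loaded):

```console
docker run --runtime=runcvm --rm -p 8080:80 --env=RUNCVM_QEMU_NET_VHOST=1 nginx
```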
### `--env=RUNCVM_SYS_ADMIN=1` By default, `virtiofsd` is not launched with `-o modcaps=+sys_admin` (and containers are not granted `CAP_SYS_ADMIN`). Use this option if you need to change this. ### `--env=RUNCVM_KERNEL_MOUNT_LIB_MODULES=1` If a RunCVM kernel (as opposed to an in-image kernel) is chosen to launch a VM, by default that kernel's modules will be mounted at `/lib/modules/<version>` in the VM. If this variable is set, that kernel's modules will instead be mounted over `/lib/modules`. ### `--env=RUNCVM_KERNEL_DEBUG=1` Enable kernel logging (sets kernel `console=ttyS0`). ### `--env=RUNCVM_BIOS_DEBUG=1` By default BIOS console output is hidden. Enable it with this option. With `--env=RUNCVM_BIOS=EFI`, this option has no effect. ### `--env=RUNCVM_RUNTIME_DEBUG=1` Enable debug logging for the runtime (the portion of RunCVM directly invoked by `docker run`, `docker exec` etc). Debug logs are written to files in `/tmp`. ### `--env=RUNCVM_QEMU_DEBUG=1` Enable logging of QEMU's exit code and OOM statistics. ### `--env=RUNCVM_BREAK=<breakpoints>` Enable breakpoints (falling to bash shell) during the RunCVM Container/VM boot process. `<breakpoints>` must be a comma-separated list of: `prenet`, `postnet`, `preqemu`. ### `--env=RUNCVM_HUGETLB=1` **[EXPERIMENTAL]** Enable use of preallocated hugetlb memory backend, which can improve performance in some scenarios. ### `--env=RUNCVM_QEMU_MEM_PREALLOC=<on|off>` Specify whether QEMU VM memory should be preallocated, or allocated on the host only as required by the guest. Default is `off`, but `on` may deliver improved performance in some scenarios at the expense of the ability to overcommit. ### `--env=RUNCVM_VIRTIOFSD_CACHE=<cache>` Specifies the cache policy passed to `virtiofsd` via `-o cache=<cache>`. The default policy is `auto`. Use `none` to avoid caching issues when multiple RunCVM instances need to share access to a volume or bind-mount on the host. ### `--env=RUNCVM_CGROUPFS=<option>` Configures cgroupfs mountpoints in the VM, which may be needed to run applications like Docker if systemd is not running. Acceptable values are: - `none`/`systemd` - do nothing; leave to the application or to systemd (if running) - `1`/`cgroup1` - mount only cgroup v1 filesystems supported by the running kernel to subdirectories of `/sys/fs/cgroup` - `2`/`cgroup2` - mount only the cgroup v2 filesystem to `/sys/fs/cgroup` - `hybrid`/`mixed` - mount cgroup v1 filesystems and mount the cgroup v2 filesystem to `/sys/fs/cgroup/unified` Please note that if `RUNCVM_CGROUPFS` is left undefined or set to an empty string, then RunCVM selects an appropriate default behaviour according to these rules: - If the specified entrypoint (or, if a symlink, its target) matches the regex `/systemd$` then assume a default value of `none`; - Else, assume a default value of `hybrid`. These rules work well in the cases of running Docker in (a) stock Alpine/Debian/Ubuntu distributions in which Docker has been installed but Systemd is not running; and (b) distributions in which Systemd is running. Of course you should set `RUNCVM_CGROUPFS` if you need to override the default behaviour. Please also note that in the case your distribution is running Systemd you may instead set `--env=RUNCVM_KERNEL_APPEND='systemd.unified_cgroup_hierarchy=<n>'` (where `<n>` is `0` or `1`) to request Systemd to create either hybrid or cgroup2-only cgroup filesystem(s) itself.
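For example, to request a cgroup v2-only layout and inspect the result, something like the following should work (the exact `mount` output will vary with image and kernel):

```console
$ docker run --runtime=runcvm --rm --env=RUNCVM_CGROUPFS=2 alpine ash -c 'mount | grep cgroup'
cgroup2 on /sys/fs/cgroup type cgroup2 (rw,relatime)
```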
## Advanced usage

### Running Docker in a RunCVM Container/VM

#### ext4 disk backing file mounted at `/var/lib/docker`

If running Docker within a VM, it is recommended that you mount a disk backing file at `/var/lib/docker`, to allow `dockerd` to use the preferred overlay filesystem and avoid it opting to use the extremely sub-performant `vfs` storage driver.

e.g. To launch a VM with a 1G ext4-formatted backing file, stored in the underlying container's overlay filesystem, and mounted at `/var/lib/docker`, run:

```sh
docker run -it --runtime=runcvm --env=RUNCVM_DISKS=/disks/docker,/var/lib/docker,ext4,1G <image>
```

To launch a VM with a 5G ext4-formatted backing file, stored in a dedicated Docker volume on the host, and mounted at `/var/lib/docker`, run:

```sh
docker run -it --runtime=runcvm --mount=type=volume,src=runcvm-disks,dst=/disks --env=RUNCVM_DISKS=/disks/docker,/var/lib/docker,ext4,5G <image>
```

In both cases, RunCVM will check for the existence of a file at `/disks/docker` and, if not found, will create a disk backing file of the given size, formatted as an ext4 filesystem, and will add the disk to `/etc/fstab`.

For full documentation of `RUNCVM_DISKS`, see above.

#### Docker volume mounted at `/var/lib/docker` (NOT RECOMMENDED)

Doing this is _not recommended_, but if running Docker within a VM, you can enable `dockerd` to use the overlay filesystem (at the cost of security) by launching with `--env=RUNCVM_SYS_ADMIN=1`. e.g.

```sh
docker run --runtime=runcvm --mount=type=volume,src=mydocker1,dst=/var/lib/docker --env=RUNCVM_SYS_ADMIN=1 <image>
```

> N.B. This option adds the `CAP_SYS_ADMIN` capability to the container and then launches `virtiofsd` with `-o modcaps=+sys_admin`.

## Developing

The following deep dive should help explain the inner workings of RunCVM, and which files to modify to implement fixes, improvements and extensions.

### runcvm-runtime

RunCVM's 'wrapper' runtime, `runcvm-runtime`, intercepts container `create` and `exec` commands and their specifications in JSON format (`config.json` and `process.json` respectively) that are normally provided (by `docker run`/`create` and `docker exec` respectively) to a standard container runtime like `runc`. The JSON file is parsed to retrieve properties of the command, and is modified to allow RunCVM to piggyback by overriding the originally intended behaviour with new behaviour.

The modifications to `create` are designed to make the created container launch a VM that boots off the container's filesystem, served using `virtiofsd`. The modifications to `exec` are designed to run commands within the VM instead of the container.

#### `runcvm-runtime` - `create` command

In more detail, the RunCVM runtime `create` process:

- Modifies the `config.json` file to:
  - Modify the container's entrypoint, to prepend `runcvm-ctr-entrypoint` to the container's original entrypoint and, if an `--init` argument was detected, remove any init process and set the container env var `RUNCVM_INIT` to `1`.
  - Set the container env var `RUNCVM_UIDGID` to the `<uid>:<gid>:<additional-gids>` intended for the container, then reset both the `<uid>` and `<gid>` to `0`.
  - Set the container env var `RUNCVM_CPUS` to the intended `--cpus` count, so it can be passed to the VM.
  - Extract and delete all requested tmpfs mounts (these will be independently mounted by the VM).
  - Add a bind mount from `/` to `/vm` that will recursively mount the following preceding mounts:
    - A bind mount from `/opt/runcvm` on the host to `/opt/runcvm` in the container.
    - A tmpfs mounted at `/.runcvm`.
  - Add a tmpfs at `/run` in the container only.
  - Map all requested bind mounts from their original mountpoint `<mnt>` to `/vm/<mnt>` (except where `<mnt>` is at or below `/disks`).
  - Determine a suitable VM launch kernel, by looking for one inside the container's image, choosing a stock RunCVM kernel matching the image, or according to an env var argument.
  - Add a bind mount to `/vm/lib/modules/<version>` for the kernel's modules.
  - Set the container env vars `RUNCVM_KERNEL_PATH`, `RUNCVM_KERNEL_INITRAMFS_PATH` and `RUNCVM_KERNEL_ROOT`.
  - Add device mounts for `/dev/kvm` and `/dev/net/tun`.
  - Set the seccomp profile to 'unconfined'.
  - Set `/dev/shm` to the size desired for the VM's memory, set the `RUNCVM_MEM_SIZE` env var accordingly, and set the container memory limit to that size plus 256MB (a contingency for QEMU itself, along with virtiofsd, dnsmasq and other container contents).
  - Add necessary capabilities, if not already present (`NET_ADMIN`, `NET_RAW`, `MKNOD`, `AUDIT_WRITE`).
  - Only if requested by `--env=RUNCVM_SYS_ADMIN=1`, add the `SYS_ADMIN` capability.
- Executes the standard container runtime `runc` with the modified `config.json`.

The `runcvm-ctr-entrypoint`:

- Is always launched as PID1 within the standard Docker container.
- Saves the container's originally-intended entrypoint and command line, environment variables and network configuration to files inside `/.runcvm`.
- Creates a bridge (acting as a hub) for each container network interface, to join that interface to a VM tap network interface.
- Launches `virtiofsd` to serve the container's root filesystem.
- Configures `/etc/resolv.conf` in the container.
- Adds container firewall rules, launches `dnsmasq`, and modifies `/vm/etc/resolv.conf` to proxy DNS requests from the VM to Docker's DNS.
- Execs RunCVM's own `runcvm-init` init process to supervise `runcvm-ctr-qemu` to launch the VM.

The `runcvm-init` process:

- Is RunCVM's custom init process, which takes over as PID1 within the container, supervising `runcvm-ctr-qemu` to launch the VM.
- Waits for a TERM signal. On receiving one, it spawns `runcvm-ctr-shutdown`, which cycles through a number of methods to try to shut down the VM cleanly.
- Waits for its child (QEMU) to exit. When it does, execs `runcvm-ctr-exit` to retrieve any saved exit code (written by the application to `/.runcvm/exitcode`) and exit with this code.

The `runcvm-ctr-qemu` script:

- Prepares disk backing files, as specified by `--env=RUNCVM_DISKS=<disks>`.
- Prepares network configuration, as saved from the container (modifying the MAC address of each container interface).
- Launches [QEMU](https://www.qemu.org/) with the required kernel, network interfaces, disks and display, with a root filesystem mounted via virtiofs from the container, and with `runcvm-vm-init` as the VM's init process.

The `runcvm-vm-init` process:

- Runs as PID1 within the VM.
- Retrieves the container configuration - network, environment, disk and tmpfs mounts - saved by `runcvm-ctr-entrypoint` to `/.runcvm`, and reproduces it within the VM.
- Launches the container's pre-existing entrypoint, in one of two ways (sketched below):
  1. If `RUNCVM_INIT` is `1` (i.e. the container was originally intended to be launched with Docker's own init process), it configures and execs busybox `init`, which becomes the VM's PID1, to supervise `dropbear`, run `runcvm-vm-start` and `poweroff` the VM if signalled to do so.
  2. Else, it backgrounds `dropbear`, then execs (via `runcvm-init`, purely to create a controlling tty) `runcvm-vm-start`, which runs as the VM's PID1.
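The following is a minimal illustrative sketch of these two launch paths (it is not the actual `runcvm-vm-init` code; paths and `dropbear` arguments are simplified):

```sh
#!/bin/sh
# Illustrative only: how runcvm-vm-init's two launch paths relate.
if [ "$RUNCVM_INIT" = "1" ]; then
  # Path 1: hand over to busybox init as the VM's PID1; its inittab (not shown)
  # supervises dropbear, runs runcvm-vm-start and powers off the VM on request.
  exec init
else
  # Path 2: background the dropbear SSH daemon, then exec runcvm-vm-start as
  # the VM's PID1 (via runcvm-init, purely to create a controlling tty).
  dropbear &
  exec runcvm-init runcvm-vm-start
fi
```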
The `runcvm-vm-start` script:

- Restores the container's originally-intended environment variables, `<uid>`, `<gid>`, `<cwd>` and `<entrypoint>`, and execs that entrypoint.

#### `runcvm-runtime` - `exec` command

The RunCVM runtime `exec` process:

- Modifies the `process.json` file to:
  - Retrieve the intended `<uid>`, `<gid>`, `<additional-gids>`, `<cwd>` and `<terminal>` for the command, as well as an indication of the existence of a `HOME` environment variable.
  - Reset both the `<uid>` and `<gid>` to `0` and the `<cwd>` to `/`.
  - Prepend `runcvm-ctr-exec '<uid>:<gid>:<additional-gids>' '<cwd>' '<has-home>' '<terminal>'` to the originally intended command.
- Executes the standard container runtime `runc` with the modified `process.json`.

The `runcvm-ctr-exec` script:

- Uses the Dropbear `dbclient` SSH client to execute the intended command, with the intended arguments, within the VM via the `runcvm-vm-exec` process, propagating the returned stdout and stderr and returning the command's exit code.

## Building

Building RunCVM requires Docker. To build RunCVM, first clone the repo, then run the build script, as follows:

```console
cd runcvm
./build/build.sh
```

The build script creates a Docker image named `newsnowlabs/runcvm:latest`. Now follow the main [installation instructions](#installation) to install your built RunCVM from the Docker image.

## Testing

Test RunCVM using nested RunCVM. You can do this using a Docker image capable of installing RunCVM, or an image built with a version of RunCVM preinstalled.

Build a suitable image as follows:

```sh
cat <<EOF | docker build --tag=ubuntu-docker-runcvm -
FROM ubuntu
RUN ... >>/etc/modules && \
    useradd --create-home --shell /bin/bash --groups sudo,docker runcvm && \
    echo runcvm:runcvm | chpasswd && \
    echo 'runcvm ALL=(ALL) NOPASSWD: ALL' >/etc/sudoers.d/runcvm
WORKDIR /home/runcvm
ENTRYPOINT ["/lib/systemd/systemd"]
VOLUME /disks

# Mount formatted backing files at:
# - /var/lib/docker for speed and overlay2 support
# - /opt/runcvm to avoid nested virtiofs, which works, but can't be great for speed
ENV RUNCVM_DISKS='/disks/docker,/var/lib/docker,ext4,2G;/disks/runcvm,/opt/runcvm,ext4,2G'

#
# Uncomment this block to preinstall RunCVM from the specified image
#
# COPY --from=newsnowlabs/runcvm:latest /opt /opt/
# RUN rm -f /etc/init.d/docker && \
#     bash /opt/runcvm/scripts/runcvm-install-runtime.sh --no-dockerd
EOF
```

(Uncomment the final block to build an image with RunCVM preinstalled, or leave the block commented to test RunCVM installation.)

To launch, run:

```sh
docker run -d --runtime=runcvm -m 2g --name=ubuntu-docker-runcvm ubuntu-docker-runcvm
```

> Optionally modify this `docker run` command by:
> - adding `--rm` - to automatically remove the container after systemd shutdown
> - removing `-d` and adding `--env=RUNCVM_KERNEL_DEBUG=1` - to see kernel and systemd boot logs
> - removing `-d` and adding `-it` - to provide a console

Then run `docker exec -it -u runcvm ubuntu-docker-runcvm bash` to obtain a command prompt and perform testing.

Run `docker rm -fv ubuntu-docker-runcvm` to clean up after testing.

## Support

**Support launching images:** If you encounter any Docker image that launches in a standard container runtime but does not launch in RunCVM, or launches but with unexpected behaviour, please [raise an issue](https://github.com/newsnowlabs/runcvm/issues) titled _Launch failure for image `<image>`_ or _Unexpected behaviour for image `<image>`_ and include log excerpts and an explanation of the failure, or of the expected and unexpected behaviour.
**For all other issues:** please still [raise an issue](https://github.com/newsnowlabs/runcvm/issues).

You can also reach out to us on the [NewsNow Labs Slack Workspace](https://join.slack.com/t/newsnowlabs/shared_invite/zt-wp54l05w-0DTxuc_n8uISJRtks3Xw3A).

We are typically available to respond to queries Monday-Friday, 9am-5pm UK time, and will be happy to help.

## Contributing

If you would like to contribute a feature suggestion or code, please raise an issue or submit a pull request.

## Uninstallation

Shut down any RunCVM containers, then run `sudo rm -rf /opt/runcvm`.

## RunCVM and Dockside

RunCVM and [Dockside](https://dockside.io/) are designed to work together, in two alternative ways.

1. Dockside can be used to launch devtainers (development environments) in RunCVM VMs, allowing you to provision containerised online IDEs for developing applications like `dockerd`, Docker swarm or `systemd`, applications that require a running kernel or kernel modules not available on the host, or specific hardware e.g. a graphics display. Follow the instructions for adding a runtime to your [Dockside profiles](https://github.com/newsnowlabs/dockside/blob/main/docs/setup.md#profiles).

2. Dockside can itself be launched inside a RunCVM VM, with its own `dockerd`, to provide increased security and compartmentalisation from the host. e.g.

```
docker run --rm -it --runtime=runcvm --memory=2g --name=docksidevm -p 443:443 -p 80:80 --mount=type=volume,src=dockside-data,dst=/data --mount=type=volume,src=dockside-disks,dst=/disks --env=RUNCVM_DISKS=/disks/disk1,/var/lib/docker,ext4,5G newsnowlabs/dockside --run-dockerd --ssl-builtin
```

## Legals

This project (known as "RunCVM"), comprising the files in this Git repository (but excluding files containing a conflicting copyright notice and licence), is copyright 2023 NewsNow Publishing Limited, Struan Bartlett, and contributors.

RunCVM is an open-source project licensed under the Apache License, Version 2.0 (the "License"); you may not use RunCVM or its constituent files except in compliance with the License.

You may obtain a copy of the License at [http://www.apache.org/licenses/LICENSE-2.0](http://www.apache.org/licenses/LICENSE-2.0).

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

> N.B. In order to run, RunCVM relies upon other third-party open-source software dependencies that are separate to and independent from RunCVM and published under their own independent licences.
>
> RunCVM Docker images made available at [https://hub.docker.com/repository/docker/newsnowlabs/runcvm](https://hub.docker.com/repository/docker/newsnowlabs/runcvm) are distributions
> designed to run RunCVM that comprise: (a) the RunCVM project source and/or object code; and
> (b) third-party dependencies that RunCVM needs to run; and which are each distributed under the terms
> of their respective licences.

================================================ FILE: build/build.sh ================================================ #!/bin/sh -e REPO=newsnowlabs/runcvm DOCKER_BUILDKIT=1 docker build -t $REPO .
cat <<_EOE_ RunCVM build successful ======================= To install or upgrade, now run: sudo ./runcvm-scripts/runcvm-install-runtime.sh _EOE_ echo ================================================ FILE: build-utils/entrypoint-install.sh ================================================ #!/bin/sh MNT=/runcvm REPO=newsnowlabs/runcvm while [ -n "$1" ]; do case "$1" in --quiet) QUIET=1; shift; continue; ;; --sleep|--wait|--pause) SLEEP=1; shift; continue; ;; *) echo "$0: Unknown argument '$1'; aborting!"; exit 2; ;; esac done if ! mountpoint $MNT >/dev/null 2>&1; then cat <<_EOE_ >&2 ERROR: Host bind-mount not specified, see below for correct usage. Usage: docker run --rm -v /opt/runcvm:$MNT $REPO [--quiet] [--sleep] - Installs runcvm package to the host at /opt/runcvm (installation elsewhere is currently unsupported) N.B. This image should normally only be used by the install script. See README.md for installation instructions. _EOE_ exit 1 fi rsync -aR --delete /opt/runcvm/./ $MNT/ || exit 1 if [ -z "$QUIET" ]; then cat <<"_EOE_" >&2 RunCVM install/upgrade successful ================================= If this is your first time installing RunCVM on this server/VM, then: 1. Run the following to update /etc/docker/daemon.conf and restart docker: sudo /opt/runcvm/scripts/runcvm-install-runtime.sh 2. Optionally, run the integration tests: ./tests/run _EOE_ fi # For installing across a docker swarm: # - Run: docker service create --name=runcvm --mode=global --mount=type=bind,src=/opt/runcvm,dst=/runcvm newsnowlabs/runcvm:latest --sleep # - Wait: until the service is created everywhere # - Run: docker service rm runcvm if [ -n "$SLEEP" ]; then echo "$(hostname): RunCVM package installed." sleep infinity else exit 0 fi ================================================ FILE: build-utils/make-bundelf-bundle.sh ================================================ #!/bin/bash # BundELF - ELF binary and dynamic library patcher/bundler for making portable/relocatable executables # ---------------------------------------------------------------------------------------------------- # # Licence: Apache 2.0 # Authors: Struan Bartlett, NewsNow Labs, NewsNow Publishing Ltd # Version: 1.1.3 # Git: https://github.com/newsnowlabs/bundelf # make-bundelf-bundle.sh is used to prepare and package ELF binaries and their # dynamic library dependencies for relocation to (and execution from) a new # location, making them completely portable and independent of the original # distribution. # # It can be used to package Linux binaries sourced from one distribution, # so that they run within, but completely independently of, any other # distribution. 
# # Example BundELF use cases: # - Bundling Alpine binaries for running within, but completely independently # of, any arbitrary distribution (including GLIBC-based distributions) # - Bundling GLIBC-based applications for running within Alpine (or indeed any # other distribution) # # BundELF is a core technology component of: # - https://github.com/newsnowlabs/dockside # - to allow running complex Node-based IDE applications and container-setup # processes inside containers running an unknown arbitrary Linux # distribution # - https://github.com/newsnowlabs/runcvm # - to allow running QEMU, virtiofsd, dnsmasq and other tools inside a # container (and indeed a VM) running an unknown arbitrary Linux # distribution # # Environment variable inputs: # - BUNDELF_BINARIES - list required binaries to be scanned and copied # - BUNDELF_DYNAMIC_PATHS - list optional paths to be scanned and copied # - BUNDELF_EXTRA_LIBS - list extra libraries to be scanned and copied # - BUNDELF_CODE_PATH - path where binaries and libraries will be copied to # - BUNDELF_EXEC_PATH - path where binaries and libraries will be executed from # = BUNDELF_MERGE_BINDIRS - non-empty if all specified binaries should be copied to $BUNDELF_CODE_PATH/bin # - BUNDELF_LIBPATH_TYPE - whether to use absolute or relative paths (the default) for RPATH # - BUNDELF_NODE_PATH - [optional] path to the node binary, if required to ensure ldd can resolve all library paths in .node files # - BUNDELF_EXTRA_SYSTEM_LIB_PATHS - [optional] list of extra system library paths to be added to the RPATH # # See README.md for full details. # BUNDELF_EXEC_PATH defaults to BUNDELF_CODE_PATH BUNDELF_EXEC_PATH="${BUNDELF_EXEC_PATH:-$BUNDELF_CODE_PATH}" # Whether to use absolute or relative paths for RPATH BUNDELF_LIBPATH_TYPE="${BUNDELF_LIBPATH_TYPE:-relative}" # Determine LD filepath, which is architecture-dependent: # e.g. ld-musl-aarch64.so.1 (linux/arm64), ld-musl-armhf.so.1 (linux/arm/v7), ld-musl-x86_64.so.1 (linux/amd64) # or ld-linux-aarch64.so.1 (linux/arm64), ld-linux-armhf.so.3 (linux/arm/v7), ld-linux-x86-64.so.2 (linux/amd64) LD_PATH=$(ls -1 /lib/ld-musl-* /lib/*-linux-*/ld-linux-*.so.* 2>/dev/null | head -n 1) LD_BIN=$(basename $LD_PATH) TMP=/tmp/bundelf.$$ append() { while read line; do echo "${line}${1}"; done } _verify_interpreter_paths() { # Verify interpreter path is correctly set in all ELF binaries # Returns 0 if all OK, 1 if any problems found local status=0 echo "Verifying interpreter paths..." >&2 while IFS= read -r bin; do echo -n "- interp: $bin ... " >&2 local interpreter=$(patchelf --print-interpreter "$bin" 2>/dev/null) if [ "$interpreter" != "$BUNDELF_EXEC_PATH$LD_PATH" ]; then echo "BAD (interpreter: $interpreter)" >&2 status=1 else echo "GOOD" >&2 fi done < "$BUNDELF_CODE_PATH/.binelfs" return $status } _verify_rpath_settings() { # Verify RPATH settings match expected patterns for relative/absolute mode # Returns 0 if all OK, 1 if any problems found local BUNDELF_CODE_PATH_REGEX=$(escape_regex "$BUNDELF_CODE_PATH") local status=0 echo "Verifying RPATH settings..." >&2 while IFS= read -r file; do echo -n "- RPATH: $file ... " >&2 local rpath=$(patchelf --print-rpath "$file" 2>/dev/null) if [ "$BUNDELF_LIBPATH_TYPE" = "absolute" ]; then # For absolute mode, all RPATHs should start with BUNDELF_CODE_PATH (or be empty, should no dynamic libraries be referenced by any bundled binaries) if ! 
echo "$rpath" | grep -qE "^($BUNDELF_CODE_PATH_REGEX|$)"; then echo "BAD (expected absolute path)" >&2 status=1 else echo "GOOD" >&2 fi else # For relative mode, all RPATHs should use $ORIGIN (or be empty, should no dynamic libraries be referenced by any bundled binaries) if ! echo "$rpath" | grep -qE '^(\$ORIGIN|$)'; then echo "BAD (expected \$ORIGIN)" >&2 status=1 else echo "GOOD" >&2 fi fi done < <(cat "$BUNDELF_CODE_PATH/.binelfs" "$BUNDELF_CODE_PATH/.libelfs") return $status } _verify_symlinks() { # Check for broken symlinks within the bundle # Returns 0 if all OK, 1 if any problems found local status=0 echo "Verifying symlinks..." >&2 while IFS= read -r link; do echo -n "- symlink: $link ... " >&2 if ! [ -e "$link" ]; then echo "BAD (broken link)" >&2 status=1 else echo "GOOD" >&2 fi done < <(find "$BUNDELF_CODE_PATH" -type l) return $status } _verify_library_resolution() { # Check that all dynamic library dependencies are correctly being resolved to versions stored within BUNDELF_CODE_PATH. # Returns 0 if all OK, 1 if any problems found local status=0 echo "Verifying library resolution..." >&2 local BUNDELF_CODE_PATH_REGEX=$(escape_regex "$BUNDELF_CODE_PATH") local BUNDELF_EXEC_PATH_REGEX=$(escape_regex "$BUNDELF_EXEC_PATH") local LD_BIN_REGEX=$(escape_regex "$LD_BIN") while IFS= read -r lib; do echo -n "- lib: $lib ... " >&2 $BUNDELF_CODE_PATH$LD_PATH --list $lib 2>/dev/null | sed -nr '/=>/!d; s/^\s*(\S+)\s*=>\s*(.*?)(\s*\(0x[0-9a-f]+\))?$/- \2 \1/;/^.+$/p;' | egrep -v -- "^- ($BUNDELF_CODE_PATH_REGEX/|$BUNDELF_EXEC_PATH_REGEX/.*/$LD_BIN_REGEX)" if [ $? -eq 0 ]; then status=1 echo "BAD" >&2 else echo "GOOD" >&2 fi sleep 0.01 done < <(cat "$BUNDELF_CODE_PATH/.binelfs" "$BUNDELF_CODE_PATH/.libelfs") return $status } verify() { local final_status=0 # Deduce BUNDELF_CODE_PATH from this script's execution path, when needed (useful when called with --verify within an alternative environment). [ -z $BUNDELF_CODE_PATH ] && BUNDELF_CODE_PATH=$(realpath $(dirname $0)/..) # Fast verifications _verify_interpreter_paths || final_status=1 _verify_symlinks || final_status=1 _verify_rpath_settings || final_status=1 _verify_library_resolution || final_status=1 if [ $final_status -eq 0 ]; then echo "All verifications passed successfully." >&2 fi exit $final_status } copy_binaries() { # Copy any binaries we require to the install location, outputing their new paths. if [ -n "$BUNDELF_MERGE_BINDIRS" ]; then mkdir -p $BUNDELF_CODE_PATH/bin else mkdir -p $BUNDELF_CODE_PATH fi for bin in "$@" do local file="$(which "$bin")" local basename="$(basename "$file")" if [ -n "$file" ]; then if [ -z "$BUNDELF_MERGE_BINDIRS" ]; then cp -a --dereference --parents $file $BUNDELF_CODE_PATH echo "$BUNDELF_CODE_PATH$file" else cp -p --dereference $file $BUNDELF_CODE_PATH/bin/ echo "$BUNDELF_CODE_PATH/bin/$basename" fi fi done } scan_extra_libs() { for p in "$@" do find "$p" ! -type d done } # Using ldd, generate list of resolved library filepaths for each ELF binary and library, e.g. # /usr/lib/libaio.so.1 # /lib/libblkid.so.1 find_lib_deps() { cat "$@" | sort -u | xargs -P $(nproc) -I '{}' ldd '{}' 2>/dev/null | sed -nr 's/^\s*(.*)=>\s*(.*?)\s.*$/\2/p' | sort -u } copy_libs() { mkdir -p $BUNDELF_CODE_PATH local BUNDELF_CODE_PATH_REGEX=$(escape_regex "$BUNDELF_CODE_PATH") # For each resolved library filepath, copy $file to the install location. # # N.B. These steps are all needed to ensure the Alpine dynamic linker can resolve library filepaths as required. 
# For more, see https://www.musl-libc.org/doc/1.0.0/manual.html # grep -v "^$BUNDELF_CODE_PATH_REGEX" "$@" | sort -u | while read file do # Copy $file; and if $file is a symlink, also copy its target. # This could result in duplicate copy operations if multiple symlinks point to the same target, # but has the advantage of simplicity. cp -a --parents $file $BUNDELF_CODE_PATH # If $file is a symlink, then copy its target too, as the target might not otherwise be copied. if [ -L "$file" ]; then # local target=$(realpath -m "$(dirname "$file")/$(readlink "$file")") local target=$(dirname "$file")/$(readlink "$file") cp -a --parents $target $BUNDELF_CODE_PATH fi if [ "$file" != "$LD_PATH" ]; then echo "$BUNDELF_CODE_PATH$file" fi done } patch_binary() { local bin="$1" if patchelf --set-interpreter $BUNDELF_EXEC_PATH$LD_PATH $bin 2>/dev/null; then echo patchelf --set-interpreter $BUNDELF_EXEC_PATH$LD_PATH $bin >>$TMP/patchelf.log return 0 fi return 1 } # Function to replace links with direct copies when using relative RPATHs. # Only replaces links when source and target are in different directories, # and thus need different RPATHs. replace_link_new() { local file="$1" local tmp_file [ "$BUNDELF_LIBPATH_TYPE" = "relative" ] || return 0 # Handle symlinks - only replace if target is in a different directory if [ -L "$file" ]; then local link_target=$(readlink "$file") local file_dir=$(dirname "$(realpath "$file")") if [ "${link_target#/}" = "$link_target" ]; then # Relative symlink: Resolve target relative to symlink location local target_full="$(cd "$(dirname "$file")" && realpath -m "$link_target")" local target_dir=$(dirname "$target_full") else # Absolute symlink: Already have full path local target_dir=$(dirname "$(realpath "$link_target")") fi if [ "$file_dir" != "$target_dir" ]; then tmp_file=$(mktemp) cp -L "$file" "$tmp_file" && mv "$tmp_file" "$file" fi return 0 fi # Handle hard links - only replace if any hard link is in a different directory local link_count=$(stat -c %h "$file") if [ "$link_count" -gt 1 ]; then local file_dir=$(dirname "$file") local needs_replacement=0 # Find all hard links to this inode and check their directories local inode=$(stat -c %i "$file") while IFS= read -r linked_file; do local linked_dir=$(dirname "$linked_file") if [ "$linked_dir" != "$file_dir" ]; then needs_replacement=1 break fi done < <(find "$BUNDELF_CODE_PATH" -samefile "$file") if [ "$needs_replacement" -eq 1 ]; then tmp_file=$(mktemp) cp -dp "$file" "$tmp_file" && mv "$tmp_file" "$file" fi fi return 0 } # Function to replace links with direct copies when using relative RPATHs replace_link() { local file="$1" local tmp_file [ "$BUNDELF_LIBPATH_TYPE" = "relative" ] || return 0 # Handle symlinks if [ -L "$file" ]; then tmp_file=$(mktemp) cp -L "$file" "$tmp_file" && mv "$tmp_file" "$file" return 0 fi # Handle hard links # If the link count is greater than 1, the file is a hard link local link_count=$(stat -c %h "$file") if [ "$link_count" -gt 1 ]; then # Create a temporary copy of the file, and overwrite the original file with the non-hard-linked copy local tmp_file=$(mktemp) cp -dp "$file" "$tmp_file" && mv "$tmp_file" "$file" fi return 0 } patch_binaries_interpreter() { # For all ELF binaries, set the interpreter to our own. 
for bin in $(sort -u "$@") do patch_binary "$bin" || exit 1 done } generate_extra_system_lib_paths() { for p in "$@" do echo $p done } escape_regex() { local s=$1 d=${2:-/} printf '%s' "$s" | sed -e "s/[][(){}.^\$*+?|\\\\$d]/\\\\&/g" } generate_system_lib_paths() { # Generate a list of system library paths # - This will be used to set the RPATH for all binaries and libraries to an absolute or relative path. # This list is generated by: # - Extracting the path to each library, relative to $BUNDELF_CODE_PATH; add leading '/' if missing. local BUNDELF_CODE_PATH_REGEX=$(escape_regex "$BUNDELF_CODE_PATH") cat "$@" | \ grep -E '\.so(\.[0-9]+)*$' | \ sed -r "s|^$BUNDELF_CODE_PATH_REGEX||; s|/[^/]+$||; s|^[^/]|/|;" | \ grep -E '^(/usr|/lib)(/|$)' | \ sort -u } generate_unique_rpath() { local prefix="$1"; shift local abs_syspaths for s in $(sort -u "$@") do # Append each system path, prefixed with $prefix, and suffixed with a colon abs_syspaths="$abs_syspaths$(echo "$prefix${s}:")" done # Remove trailing colon echo $abs_syspaths | sed 's/:$//' } patch_binaries_and_libs_rpath() { # For all ELF libs, set the RPATH to our own, and force RPATH use. local p local rpath local rpath_template if [ "$BUNDELF_LIBPATH_TYPE" = "absolute" ]; then rpath_template=$(generate_unique_rpath "$BUNDELF_CODE_PATH" "$TMP/system-lib-paths") else rpath_template=$(generate_unique_rpath "\$ORIGIN" "$TMP/system-lib-paths") fi echo "BUNDELF_CODE_PATH: $BUNDELF_CODE_PATH" >>$TMP/patchelf.log echo "RPATH template: ${rpath_template@Q}" >>$TMP/patchelf.log local BUNDELF_CODE_PATH_REGEX=$(escape_regex "$BUNDELF_CODE_PATH") for lib in $(sort -u "$@") do if [ "$BUNDELF_LIBPATH_TYPE" = "absolute" ]; then rpath="$rpath_template" # Add node as a needed library to '.node' files, to avoid misleading ldd errors in verify() if [ -n "$BUNDELF_NODE_PATH" ] && echo "$lib" | grep -qE "\.node$"; then echo patchelf --add-needed "$BUNDELF_CODE_PATH$BUNDELF_NODE_PATH" $lib >>$TMP/patchelf.log patchelf --add-needed "$BUNDELF_CODE_PATH$BUNDELF_NODE_PATH" $lib >>$TMP/patchelf.log 2>&1 || exit 1 fi else # If $lib is linked in different parts of the file hierarchy, then setting a relative RPATH on one file would break the correct RPATH set on another. # To prevent this, we un-hardlink any hardlinked files before we patch them. replace_link "$lib" p=$(dirname "$lib" | sed -r "s|^$BUNDELF_CODE_PATH_REGEX[/]+||; s|[^/]+|..|g") # rpath="\$ORIGIN/$p/lib:\$ORIGIN/$p/usr/lib:\$ORIGIN/$p/usr/lib/xtables" rpath="$(echo "$rpath_template" | sed "s|\$ORIGIN|\$ORIGIN/$p|g")" # Add node as a needed library to '.node' files, to avoid misleading ldd errors in verify() if [ -n "$BUNDELF_NODE_PATH" ] && echo "$lib" | grep -qE "\.node$"; then local NODE_DIR=$(dirname $BUNDELF_NODE_PATH) local NODE_BASENAME=$(basename $BUNDELF_NODE_PATH) # Augment rpath with relative path to the NODE_DIR rpath="$rpath:\$ORIGIN/$p$NODE_DIR" # Add a needed dynamic library dependency for NODE_BASENAME (will be searched for within the augmented rpath) echo patchelf --add-needed "$NODE_BASENAME" "$lib" >>$TMP/patchelf.log patchelf --add-needed "$NODE_BASENAME" "$lib" >>$TMP/patchelf.log 2>&1 || exit 1 fi fi echo patchelf --force-rpath --set-rpath ${rpath@Q} "$lib" >>$TMP/patchelf.log patchelf --force-rpath --set-rpath \ "$rpath" \ "$lib" >>$TMP/patchelf.log 2>&1 || exit 1 # Fail silently if patchelf fails to set the interpreter: this is a catch-all for libraries like /usr/lib/libcap.so.2 # which strangely have an interpreter set. 
patch_binary "$lib" done } copy_and_scan_for_dynamics() { # Find all ELF files that are dynamically linked. # - This should includes all Theia .node files and spawn-helper, but not statically-linked binaries like 'rg' # - The only way to tell if a file is an ELF binary (or library) is to check the first 4 bytes for the magic byte sequence. mkdir -p $BUNDELF_CODE_PATH for q in "$@" do # Skip non-existent paths [ -d "$q" ] || continue tar cv "$q" 2>/dev/null | tar x -C $BUNDELF_CODE_PATH/ find "$q" -type f ! -name '*.o' -print0 | xargs -0 -P $(nproc) -I '{}' hexdump -n 4 -e '4/1 "%2x" " {}\n"' {} | sed '/^7f454c46/!d; s/^7f454c46 //' | xargs -P $(nproc) file | grep dynamically done } get_dynamics_interpretable() { grep interpreter "$@" | cut -d':' -f1 | sed -r "s!^!$BUNDELF_CODE_PATH!" } get_dynamics_noninterpretable() { grep -v interpreter "$@" | cut -d':' -f1 | sed -r "s!^!$BUNDELF_CODE_PATH!" } write_digest() { # Prepare full and unique list of ELF binaries and libs for reference purposes and for checking sort -u $TMP/bins-copied >$BUNDELF_CODE_PATH/.binelfs sort -u $TMP/libs-copied >$BUNDELF_CODE_PATH/.libelfs } init() { for dep in file hexdump xargs patchelf do if ! [ -x "$(which $dep)" ]; then depsmissing=1 echo "ERROR: Command '$dep' not found in PATH '$PATH'" >&2 fi done [ -n "$depsmissing" ] && return 1 # Initialise mkdir -p "$TMP" >$TMP/bins-copied >$TMP/libs-copied >$TMP/libs >$TMP/libs-extra >$TMP/libs-deps >$TMP/libs-new >$TMP/scanned-dynamics >$TMP/system-lib-paths } all() { # Copy elf binaries to BUNDELF_CODE_PATH and generate 'bins-copied' list of ELF binaries copy_binaries $BUNDELF_BINARIES >>$TMP/bins-copied # Scan for additional dynamic binaries and libs copy_and_scan_for_dynamics $BUNDELF_DYNAMIC_PATHS >>$TMP/scanned-dynamics # Add the intepretable dynamics to 'bins-copied' get_dynamics_interpretable $TMP/scanned-dynamics >>$TMP/bins-copied # Add the non-intepretable dynamics to 'libs' get_dynamics_noninterpretable $TMP/scanned-dynamics >>$TMP/libs-copied # Scan for extra libraries not formally declared as dependencies scan_extra_libs $BUNDELF_EXTRA_LIBS >>$TMP/libs-extra # Generate unique list of dynamic binaries and libs sort -u $TMP/bins-copied $TMP/libs-copied $TMP/libs-extra >>$TMP/libs # Iteratively find all library dependencies of libraries in 'libs', until no new libraries are found while true do # Find library dependencies of libraries in 'libs'; write to 'libs-new' find_lib_deps $TMP/libs >>$TMP/libs-deps sort -u $TMP/libs $TMP/libs-deps >$TMP/libs-new if diff -q $TMP/libs $TMP/libs-new >/dev/null 2>&1; then break fi mv $TMP/libs-new $TMP/libs done # Copy libraries from 'libs' to BUNDELF_CODE_PATH and itemise new copied paths (overwriting previous incomplete 'libs-copied') copy_libs $TMP/libs >$TMP/libs-copied # Patch interpreter on all ELF binaries in 'bins-copied' patch_binaries_interpreter $TMP/bins-copied # Generate non-unique list of system library paths: generate_system_lib_paths $TMP/libs-copied >>$TMP/system-lib-paths generate_extra_system_lib_paths $BUNDELF_EXTRA_SYSTEM_LIB_PATHS >>$TMP/system-lib-paths # Patch RPATH on all binaries in 'bins-copied' and libs in 'libs-copied' patch_binaries_and_libs_rpath $TMP/bins-copied $TMP/libs-copied # Write a summary of binaries and libraries to BUNDELF_CODE_PATH write_digest # Copy LD and and create copnvenience symlink it to ld cp --parents $LD_PATH $BUNDELF_CODE_PATH ln -sf $(echo $LD_PATH | sed -r 's|^/lib/|./|') $BUNDELF_CODE_PATH/lib/ld } # Run with --verify from within any distribution, to check that all 
dynamic library dependencies # are correctly being resolved to versions stored within BUNDELF_CODE_PATH. if [ "$1" = "--verify" ]; then # Check the full list for any library dependencies being inadvertently resolved outside the install location. # Returns true if OK, false on any problems. init || exit 1 verify elif [ "$1" = "--bundle" ]; then init || exit 1 all verify fi ================================================ FILE: kernels/oraclelinux/95virtiofs/module-setup.sh ================================================ #!/usr/bin/bash # called by dracut check() { [[ $hostonly ]] || [[ $mount_needs ]] && { for fs in "${host_fs_types[@]}"; do [[ "$fs" == "virtiofs" ]] && return 0 done return 255 } is_qemu_virtualized && return 0 return 255 } # called by dracut depends() { return 0 } # called by dracut installkernel() { instmods virtiofs # qemu specific modules hostonly='' instmods \ ata_piix ata_generic pata_acpi cdrom sr_mod ahci \ virtio_blk virtio virtio_ring virtio_pci \ virtio_scsi virtio_console virtio_rng virtio_mem \ virtio_net \ spapr-vscsi \ qemu_fw_cfg } # called by dracut install() { inst_hook cmdline 95 "$moddir/parse-virtiofs.sh" inst_hook pre-mount 99 "$moddir/mount-virtiofs.sh" } ================================================ FILE: kernels/oraclelinux/95virtiofs/mount-virtiofs.sh ================================================ #!/usr/bin/sh type getarg >/dev/null 2>&1 || . /lib/dracut-lib.sh filter_rootopts() { rootopts=$1 # strip ro and rw options local OLDIFS="$IFS" IFS=, set -- $rootopts IFS="$OLDIFS" local v while [ $# -gt 0 ]; do case $1 in rw|ro);; defaults);; *) v="$v,${1}";; esac shift done rootopts=${v#,} echo $rootopts } mount_root() { local _ret rootfs="virtiofs" rflags="rw" modprobe virtiofs mount -t ${rootfs} -o "$rflags",ro "${root#virtiofs:}" "$NEWROOT" rootopts= if getargbool 1 rd.fstab -n rd_NO_FSTAB \ && ! getarg rootflags \ && [ -f "$NEWROOT/etc/fstab" ] \ && ! 
[ -L "$NEWROOT/etc/fstab" ]; then # if $NEWROOT/etc/fstab contains special mount options for # the root filesystem, # remount it with the proper options rootopts="defaults" while read dev mp fs opts rest || [ -n "$dev" ]; do # skip comments [ "${dev%%#*}" != "$dev" ] && continue if [ "$mp" = "/" ]; then rootopts=$opts break fi done < "$NEWROOT/etc/fstab" rootopts=$(filter_rootopts $rootopts) fi # we want rootflags (rflags) to take precedence so prepend rootopts to # them; rflags is guaranteed to not be empty rflags="${rootopts:+${rootopts},}${rflags}" umount "$NEWROOT" info "Remounting ${root#virtiofs:} with -o ${rflags}" mount -t ${rootfs} -o "$rflags" "${root#virtiofs:}" "$NEWROOT" 2>&1 | vinfo [ -f "$NEWROOT"/forcefsck ] && rm -f -- "$NEWROOT"/forcefsck 2>/dev/null [ -f "$NEWROOT"/.autofsck ] && rm -f -- "$NEWROOT"/.autofsck 2>/dev/null } if [ -n "$root" -a -z "${root%%virtiofs:*}" ]; then mount_root fi : ================================================ FILE: kernels/oraclelinux/95virtiofs/parse-virtiofs.sh ================================================ #!/usr/bin/sh if [ "${root%%:*}" = "virtiofs" ] ; then modprobe virtiofs rootok=1 fi ================================================ FILE: kernels/oraclelinux/addvirtiofs.conf ================================================ add_dracutmodules+=" virtiofs " filesystems+=" virtiofs " ================================================ FILE: patches/dnsmasq/remove-passwd-requirement.patch ================================================ --- a/src/dnsmasq.c.orig +++ b/src/dnsmasq.c @@ -481,6 +481,7 @@ } #endif +#if 0 if (daemon->username && !(ent_pw = getpwnam(daemon->username))) baduser = daemon->username; else if (daemon->groupname && !(gp = getgrnam(daemon->groupname))) @@ -488,6 +489,7 @@ if (baduser) die(_("unknown user or group: %s"), baduser, EC_BADCONF); +#endif /* implement group defaults, "dip" if available, or group associated with uid */ if (!daemon->group_set && !gp) ================================================ FILE: patches/dropbear/runcvm.patch ================================================ --- a/src/cli-kex.c +++ b/src/cli-kex.c @@ -312,7 +312,7 @@ int ret; if (cli_opts.no_hostkey_check) { - dropbear_log(LOG_INFO, "Caution, skipping hostkey check for %s\n", cli_opts.remotehost); + // dropbear_log(LOG_INFO, "Caution, skipping hostkey check for %s\n", cli_opts.remotehost); return; } --- a/src/dbutil.c +++ b/src/dbutil.c @@ -140,7 +140,9 @@ vsnprintf(printbuf, sizeof(printbuf), format, param); +#if 0 fprintf(stderr, "%s\n", printbuf); +#endif } --- a/src/default_options.h +++ b/src/default_options.h @@ -21,10 +21,10 @@ /* Default hostkey paths - these can be specified on the command line. * Homedir is prepended if path begins with ~/ */ -#define DSS_PRIV_FILENAME "/etc/dropbear/dropbear_dss_host_key" -#define RSA_PRIV_FILENAME "/etc/dropbear/dropbear_rsa_host_key" -#define ECDSA_PRIV_FILENAME "/etc/dropbear/dropbear_ecdsa_host_key" -#define ED25519_PRIV_FILENAME "/etc/dropbear/dropbear_ed25519_host_key" +#define DSS_PRIV_FILENAME "/.runcvm/dropbear/dropbear_dss_host_key" +#define RSA_PRIV_FILENAME "/.runcvm/dropbear/dropbear_rsa_host_key" +#define ECDSA_PRIV_FILENAME "/.runcvm/dropbear/dropbear_ecdsa_host_key" +#define ED25519_PRIV_FILENAME "/.runcvm/dropbear/dropbear_ed25519_host_key" /* Set NON_INETD_MODE if you require daemon functionality (ie Dropbear listens * on chosen ports and keeps accepting connections. This is the default. 
@@ -218,7 +218,7 @@ #define DO_HOST_LOOKUP 0 /* Whether to print the message of the day (MOTD). */ -#define DO_MOTD 1 +#define DO_MOTD 0 #define MOTD_FILENAME "/etc/motd" /* Authentication Types - at least one required. ================================================ FILE: patches/mkinitfs/nlplug-findfs.patch ================================================ --- mkinitfs-3.8.1.orig/nlplug-findfs/nlplug-findfs.c +++ mkinitfs-3.8.1/nlplug-findfs/nlplug-findfs.c @@ -41,7 +41,7 @@ #include #include -#define MAX_EVENT_TIMEOUT 5000 +#define MAX_EVENT_TIMEOUT 1000 #define DEFAULT_EVENT_TIMEOUT 250 /* usb mass storage needs 1 sec to settle */ #define USB_STORAGE_TIMEOUT 1000 ================================================ FILE: patches/seabios/qemu-fw-cfg-fix.patch ================================================ diff --git a/src/sercon.c b/src/sercon.c index 3019d9b..988c2a2 100644 --- a/src/sercon.c +++ b/src/sercon.c @@ -516,7 +516,7 @@ void sercon_setup(void) struct segoff_s seabios, vgabios; u16 addr; - addr = romfile_loadint("etc/sercon-port", 0); + addr = romfile_loadint("opt/org.seabios/etc/sercon-port", 0); if (!addr) return; dprintf(1, "sercon: using ioport 0x%x\n", addr); diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index fba4e52..9a346d9 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c diff --git a/src/fw/paravirt.c b/src/fw/paravirt.c index fba4e52..9a346d9 100644 --- a/src/fw/paravirt.c +++ b/src/fw/paravirt.c @@ -652,9 +652,9 @@ void qemu_cfg_init(void) // serial console u16 nogfx = 0; qemu_cfg_read_entry(&nogfx, QEMU_CFG_NOGRAPHIC, sizeof(nogfx)); - if (nogfx && !romfile_find("etc/sercon-port") + if (nogfx && !romfile_find("opt/org.seabios/etc/sercon-port") && !romfile_find("vgaroms/sgabios.bin")) - const_romfile_add_int("etc/sercon-port", PORT_SERIAL1); + const_romfile_add_int("opt/org.seabios/etc/sercon-port", PORT_SERIAL1); } /* ================================================ FILE: qemu-exit/qemu-exit.c ================================================ #include #include #include #include #define SHUTDOWN_PORT 0x604 #define EXIT_PORT 0x501 static void clean_exit(void) { ioperm(SHUTDOWN_PORT, 16, 1); outw(0x2000, SHUTDOWN_PORT); } int main(int argc, char **argv) { int status; if (argc != 2) { clean_exit(); } status = atoi(argv[1]); if (!status) { clean_exit(); } ioperm(EXIT_PORT, 8, 1); // status returned is 1+(2*orig_status) outb(status-1, EXIT_PORT); // Didn't exit. Perhaps QEMU was not launched with -device isa-debug-exit exit(255); } ================================================ FILE: runcvm-init/VERSION.h ================================================ // THIS FILE IS AUTOMATICALLY GENERATED // Run `make VERSION.h` to update it after modifying VERSION. unsigned char VERSION[] = { 0x31, 0x2e, 0x32, 0x2e, 0x35, 0x0a }; unsigned int VERSION_len = 6; ================================================ FILE: runcvm-init/dumb-init.c ================================================ // For the purposes of the following license, the "Software" is this file, dumb-init.c. // // The MIT License (MIT) // // Copyright (c) 2015 Yelp, Inc. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // dumb-init.c modifications (c) 2022 NewsNow Publishing Limited /* * dumb-init is a simple wrapper program designed to run as PID 1 and pass * signals to its children. * * Usage: * ./dumb-init python -c 'while True: pass' * * To get debug output on stderr, run with '-v'. */ #include #include #include #include #include #include #include #include #include #include #include #include "VERSION.h" #define PRINTERR(...) do { \ fprintf(stderr, "[runcvm-init] " __VA_ARGS__); \ } while (0) #define DEBUG(...) do { \ if (debug) { \ PRINTERR(__VA_ARGS__); \ } \ } while (0) // Signals we care about are numbered from 1 to 31, inclusive. // (32 and above are real-time signals.) // TODO: this is likely not portable outside of Linux, or on strange architectures #define MAXSIG 31 // Indices are one-indexed (signal 1 is at index 1). Index zero is unused. // User-specified signal rewriting. int signal_rewrite[MAXSIG + 1] = {[0 ... MAXSIG] = -1}; // One-time ignores due to TTY quirks. 0 = no skip, 1 = skip the next-received signal. char signal_temporary_ignores[MAXSIG + 1] = {[0 ... MAXSIG] = 0}; pid_t child_pid = -1; char debug = 0; char use_setsid = 1; char no_fork = 0; int translate_signal(int signum) { if (signum <= 0 || signum > MAXSIG) { return signum; } else { int translated = signal_rewrite[signum]; if (translated == -1) { return signum; } else { DEBUG("Translating signal %d to %d.\n", signum, translated); return translated; } } } void forward_signal(int signum) { signum = translate_signal(signum); if (signum != 0) { kill(use_setsid ? -child_pid : child_pid, signum); DEBUG("Forwarded signal %d to children.\n", signum); } else { DEBUG("Not forwarding signal %d to children (ignored).\n", signum); } } pid_t shutdown() { pid_t my_child_pid; char *shutdown_cmd[] = {"/.runcvm/guest/scripts/runcvm-ctr-shutdown", NULL}; my_child_pid = fork(); if (my_child_pid < 0) { PRINTERR("Unable to fork. 
Exiting.\n"); return 1; } else if (my_child_pid == 0) { /* child */ DEBUG("Requesting child to shut down by spawning %s\n", shutdown_cmd[0]); execvp(shutdown_cmd[0], &shutdown_cmd[0]); // if this point is reached, exec failed, so we should exit nonzero PRINTERR("Shutdown child spawn failed: %s\n", strerror(errno)); return 2; } else { /* parent */ DEBUG("Shutdown child spawned with PID %d.\n", child_pid); } return my_child_pid; } void quit(int exit_status) { char exit_status_string[4]; char *exit_cmd[] = {"/.runcvm/guest/scripts/runcvm-ctr-exit", exit_status_string, NULL}; sprintf(exit_status_string, "%d", exit_status & 0xFF); DEBUG("Exiting by execing: %s %s\n", exit_cmd[0], exit_cmd[1]); execvp(exit_cmd[0], &exit_cmd[0]); DEBUG("Failed to exec %s, so exiting now with status %d\n", exit_cmd[0], exit_status); exit(exit_status); } /* * The dumb-init signal handler. * * The main job of this signal handler is to forward signals along to our child * process(es). In setsid mode, this means signaling the entire process group * rooted at our child. In non-setsid mode, this is just signaling the primary * child. * * In most cases, simply proxying the received signal is sufficient. If we * receive a job control signal, however, we should not only forward it, but * also sleep dumb-init itself. * * This allows users to run foreground processes using dumb-init and to * control them using normal shell job control features (e.g. Ctrl-Z to * generate a SIGTSTP and suspend the process). * * The libc manual is useful: * https://www.gnu.org/software/libc/manual/html_node/Job-Control-Signals.html * */ void handle_signal(int signum) { DEBUG("Received signal %d.\n", signum); if (signal_temporary_ignores[signum] == 1) { DEBUG("Ignoring tty hand-off signal %d.\n", signum); signal_temporary_ignores[signum] = 0; } else if (signum == SIGTERM) { shutdown(); } else if (signum == SIGCHLD) { int status, exit_status; pid_t killed_pid; while ((killed_pid = waitpid(-1, &status, WNOHANG)) > 0) { if (WIFEXITED(status)) { exit_status = WEXITSTATUS(status); DEBUG("A child with PID %d exited with exit status %d.\n", killed_pid, exit_status); } else { assert(WIFSIGNALED(status)); exit_status = 128 + WTERMSIG(status); DEBUG("A child with PID %d was terminated by signal %d.\n", killed_pid, exit_status - 128); } if (killed_pid == child_pid) { forward_signal(SIGTERM); // send SIGTERM to any remaining children DEBUG("Child exited with status %d. 
Goodbye.\n", exit_status); quit(exit_status); // exit(exit_status); } } } else { forward_signal(signum); if (signum == SIGTSTP || signum == SIGTTOU || signum == SIGTTIN) { DEBUG("Suspending self due to TTY signal.\n"); kill(getpid(), SIGSTOP); } } } void print_help(char *argv[]) { fprintf(stderr, "runcvm-init v%.*s" "Usage: %s [option] command [[arg] ...]\n" "\n" "runcvm-init is a simple process supervisor that forwards signals to children.\n" "It is designed to run as PID1 in minimal container environments.\n" "\n" "Optional arguments:\n" " -c, --single-child Run in single-child mode.\n" " In this mode, signals are only proxied to the\n" " direct child and not any of its descendants.\n" " -r, --rewrite s:r Rewrite received signal s to new signal r before proxying.\n" " To ignore (not proxy) a signal, rewrite it to 0.\n" " This option can be specified multiple times.\n" " -v, --verbose Print debugging information to stderr.\n" " -h, --help Print this help message and exit.\n" " -V, --version Print the current version and exit.\n" " -F, --no-fork Don't fork, just set up signals and tty\n" "\n", VERSION_len, VERSION, argv[0] ); } void print_rewrite_signum_help() { fprintf( stderr, "Usage: -r option takes :, where " "is between 1 and %d.\n" "This option can be specified multiple times.\n" "Use --help for full usage.\n", MAXSIG ); exit(1); } void parse_rewrite_signum(char *arg) { int signum, replacement; if ( sscanf(arg, "%d:%d", &signum, &replacement) == 2 && (signum >= 1 && signum <= MAXSIG) && (replacement >= 0 && replacement <= MAXSIG) ) { signal_rewrite[signum] = replacement; } else { print_rewrite_signum_help(); } } void set_rewrite_to_sigstop_if_not_defined(int signum) { if (signal_rewrite[signum] == -1) { signal_rewrite[signum] = SIGSTOP; } } char **parse_command(int argc, char *argv[]) { int opt; struct option long_options[] = { {"help", no_argument, NULL, 'h'}, {"single-child", no_argument, NULL, 'c'}, {"rewrite", required_argument, NULL, 'r'}, {"verbose", no_argument, NULL, 'v'}, {"version", no_argument, NULL, 'V'}, {"no-fork", no_argument, NULL, 'F'}, {NULL, 0, NULL, 0}, }; while ((opt = getopt_long(argc, argv, "+hvVcFr:", long_options, NULL)) != -1) { switch (opt) { case 'h': print_help(argv); exit(0); case 'v': debug = 1; break; case 'V': fprintf(stderr, "dumb-init v%.*s", VERSION_len, VERSION); exit(0); case 'c': use_setsid = 0; break; case 'r': parse_rewrite_signum(optarg); break; case 'F': no_fork = 1; break; default: exit(1); } } if (optind >= argc) { fprintf( stderr, "Usage: %s [option] program [args]\n" "Try %s --help for full usage.\n", argv[0], argv[0] ); exit(1); } char *debug_env = getenv("DUMB_INIT_DEBUG"); if (debug_env && strcmp(debug_env, "1") == 0) { debug = 1; DEBUG("Running in debug mode.\n"); } char *setsid_env = getenv("DUMB_INIT_SETSID"); if (setsid_env && strcmp(setsid_env, "0") == 0) { use_setsid = 0; DEBUG("Not running in setsid mode.\n"); } if (use_setsid) { set_rewrite_to_sigstop_if_not_defined(SIGTSTP); set_rewrite_to_sigstop_if_not_defined(SIGTTOU); set_rewrite_to_sigstop_if_not_defined(SIGTTIN); } return &argv[optind]; } // A dummy signal handler used for signals we care about. // On the FreeBSD kernel, ignored signals cannot be waited on by `sigwait` (but // they can be on Linux). We must provide a dummy handler. 
// https://lists.freebsd.org/pipermail/freebsd-ports/2009-October/057340.html void dummy(int signum) {} int main(int argc, char *argv[]) { char **cmd = parse_command(argc, argv); sigset_t all_signals; sigfillset(&all_signals); sigprocmask(SIG_BLOCK, &all_signals, NULL); int i = 0; for (i = 1; i <= MAXSIG; i++) { signal(i, dummy); } /* * Detach dumb-init from controlling tty, so that the child's session can * attach to it instead. * * We want the child to be able to be the session leader of the TTY so that * it can do normal job control. */ if (use_setsid) { if (ioctl(STDIN_FILENO, TIOCNOTTY) == -1) { DEBUG( "Unable to detach from controlling tty (errno=%d %s).\n", errno, strerror(errno) ); } else { /* * When the session leader detaches from its controlling tty via * TIOCNOTTY, the kernel sends SIGHUP and SIGCONT to the process * group. We need to be careful not to forward these on to the * dumb-init child so that it doesn't receive a SIGHUP and * terminate itself (#136). */ if (getsid(0) == getpid()) { DEBUG("Detached from controlling tty, ignoring the first SIGHUP and SIGCONT we receive.\n"); signal_temporary_ignores[SIGHUP] = 1; signal_temporary_ignores[SIGCONT] = 1; } else { DEBUG("Detached from controlling tty, but was not session leader.\n"); } } } if(no_fork) { child_pid = 0; } else { child_pid = fork(); } if (child_pid < 0) { PRINTERR("Unable to fork. Exiting.\n"); return 1; } else if (child_pid == 0) { /* child */ sigprocmask(SIG_UNBLOCK, &all_signals, NULL); if (use_setsid) { // Don't throw error if setsid() fails in no_fork mode; // we don't want this to prevent startup. if (setsid() == -1 && !no_fork) { PRINTERR( "Unable to setsid (errno=%d %s). Exiting.\n", errno, strerror(errno) ); exit(1); } if (ioctl(STDIN_FILENO, TIOCSCTTY, 0) == -1) { DEBUG( "Unable to attach to controlling tty (errno=%d %s).\n", errno, strerror(errno) ); } DEBUG("setsid complete.\n"); } execvp(cmd[0], &cmd[0]); // if this point is reached, exec failed, so we should exit nonzero PRINTERR("%s: %s\n", cmd[0], strerror(errno)); return 2; } else { /* parent */ DEBUG("Child spawned with PID %d.\n", child_pid); if (chdir("/") == -1) { DEBUG("Unable to chdir(\"/\") (errno=%d %s)\n", errno, strerror(errno)); } for (;;) { int signum; sigwait(&all_signals, &signum); handle_signal(signum); } } } ================================================ FILE: runcvm-scripts/functions/cgroupfs ================================================ cgroupfs_mount() { local cgroupfs="$1" # We want no cgroupfs at all, or we will leave it to the distribution. if [[ "$cgroupfs" = "none" || "$cgroupfs" = "systemd" ]]; then return fi # If defined in fstab, or there's no kernel support, skip. # see also https://github.com/tianon/cgroupfs-mount/blob/master/cgroupfs-mount if grep -v '^#' /etc/fstab | grep -q cgroup \ || [ ! -e /proc/cgroups ] \ || [ ! -d /sys/fs/cgroup ]; then return fi # If hybrid, mixed, or cgroup1 cgroup support is requested... if [[ "$cgroupfs" = "hybrid" || "$cgroupfs" = "mixed" || "$cgroupfs" = "1" || "$cgroupfs" = "cgroup1" ]]; then if ! findmnt -rnu -M /sys/fs/cgroup; then mount -t tmpfs -o uid=0,gid=0,mode=0755 cgroup /sys/fs/cgroup fi for subtype in $(awk '!/^#/ { if ($4 == 1) print $1 }' /proc/cgroups); do local sys="/sys/fs/cgroup/$subtype" mkdir -p $sys if ! findmnt -rnu -M $sys; then if ! mount -n -t cgroup -o $subtype cgroup $sys; then rmdir $sys || true fi fi done fi # If hybrid or mixed cgroup support is requested... if [[ "$cgroupfs" = "hybrid" || "$cgroupfs" = "mixed" ]]; then if ! 
findmnt -rnu -M /sys/fs/cgroup/unified; then mkdir -p /sys/fs/cgroup/unified mount -t cgroup2 -o rw,nosuid,nodev,noexec,relatime cgroup2 /sys/fs/cgroup/unified fi fi # If purely cgroup2 cgroup support is requested... if [[ "$cgroupfs" = "2" || "$cgroupfs" = "cgroup2" ]]; then if ! findmnt -rnu -M /sys/fs/cgroup; then mkdir -p /sys/fs/cgroup mount -t cgroup2 -o rw,nosuid,nodev,noexec,relatime cgroup2 /sys/fs/cgroup fi fi } ================================================ FILE: runcvm-scripts/runcvm-ctr-defaults ================================================ #!/bin/bash RUNCVM_GUEST=${RUNCVM_GUEST:-/.runcvm/guest} RUNCVM_PATH=$RUNCVM_GUEST/usr/sbin:$RUNCVM_GUEST/usr/bin:$RUNCVM_GUEST/sbin:$RUNCVM_GUEST/bin:$RUNCVM_GUEST/usr/lib/qemu QEMU_VIRTIOFSD_SOCKET=/run/.virtiofs.sock QEMU_GUEST_AGENT=/run/.qemu-guest-agent QEMU_MONITOR_SOCKET=/run/.qemu-monitor-socket SSHD_PORT=22222 clean_env() { export -n \ RUNCVM_BREAK RUNCVM_INIT \ RUNCVM_GUEST \ RUNCVM_RUNTIME_DEBUG \ RUNCVM_BIOS_DEBUG RUNCVM_BIOS \ RUNCVM_DISPLAY_MODE \ RUNCVM_QEMU_DEBUG RUNCVM_QEMU_ARCH RUNCVM_QEMU_DISPLAY RUNCVM_QEMU_VGA RUNCVM_QEMU_VNC_DISPLAY RUNCVM_QEMU_USB RUNCVM_QEMU_NET_VHOST RUNCVM_QEMU_MEM_PREALLOC \ RUNCVM_KERNEL RUNCVM_KERNEL_ROOT RUNCVM_KERNEL_APPEND RUNCVM_KERNEL_INITRAMFS_PATH RUNCVM_KERNEL_DEBUG RUNCVM_KERNEL_PATH \ RUNCVM_DISKS \ RUNCVM_UIDGID RUNCVM_VM_MOUNTPOINT RUNCVM_TMPFS \ RUNCVM_CPUS RUNCVM_MEM_SIZE RUNCVM_HUGETLB \ RUNCVM_HAS_HOME \ RUNCVM_VIRTIOFSD_CACHE \ RUNCVM_CGROUPFS # May be set in VM by busybox init process export -n USER } load_network() { local if="${1:-default}" [ -d /.runcvm/network/devices ] && [ -s /.runcvm/network/devices/$if ] || return 1 read -r DOCKER_IF DOCKER_IF_MAC DOCKER_IF_MTU DOCKER_IF_IP DOCKER_IF_IP_NETPREFIX DOCKER_IF_IP_GW /.runcvm/entrypoint # SET HOME ENV VAR IF NEEDED # - See https://github.com/moby/moby/issues/2968#issuecomment-35822318 # for details of how Docker sets HOME. # # - What this means is that: # 1. if HOME is defined in the image and # docker run: # a. does not define HOME # - config.json process.env[] will show the image-defined value and this value will be used # - docker exec # - does not define HOME, then process.json env[] will show the image-defined value and this value will be used # - does define HOME, then process.json env[] will show the exec-defined value and this value will be used # b. does define HOME, config.json process.env[] will show the docker run-defined value and this value will be used # - docker exec # - does not define HOME, then process.json env[] will show the docker run-defined value and this value will be used # - does define HOME, then process.json env[] will show the exec-defined value and this value will be used # (the above is irrespective of -u setting) # # 2. if HOME is not defined in the image and # docker run: # a. does not define HOME # - config.json process.env[] will show no HOME value and the user's default homedir will be used # - docker exec # - does not define HOME, then process.json env[] will show no HOME value and the user's default homedir will be used # - does define HOME, then process.json env[] will show the exec-defined value and this value will be used # b. 
does define HOME, config.json process.env[] will show the docker run-defined value and this value will be used # - docker exec # - does not define HOME, then process.json env[] will show the docker run-defined value and this value will be used # - does define HOME, then process.json env[] will show the exec-defined value and this value will be used # Problem in 2a for us with docker run and docker exec is that while we save the requested uid:gid, we set the actual uid:gid to 0:0 # to allow us to run virtiofsd (and, today, qemu) (in the docker run case) and access the qemu guest agent socket (in the docker exec case - though use of the agent is deprecated in favour of ssh). # # Where HOME is not explicitly defined, this leads to docker setting HOME to root's default homedir (typically /root), # for the calls to runcvm-ctr-entrypoint and runcvm-ctr-exec (respectively). # # How then do we distinguish this case from the case where HOME is explicitly set to /root? # The answer is that runcvm-runtime must check for HOME in env[] and indicate its presence in the calls to runcvm-ctr-entrypoint and runcvm-ctr-exec. # # runcvm-runtime does this: # - in the docker run case, via the RUNCVM_HAS_HOME env var # - in the docker exec case, via an argument to runcvm-ctr-exec # Here we check RUNCVM_HAS_HOME to determine whether the HOME env var was set either in the image, or via docker run. # If not, then we set HOME to the requested user's default homedir in accordance with https://github.com/moby/moby/issues/2968. if [ "$RUNCVM_HAS_HOME" == "0" ]; then HOME=$($RUNCVM_GUEST/usr/bin/getent passwd "${RUNCVM_UIDGID%%:*}" | $RUNCVM_GUEST/bin/cut -d':' -f6) fi if [ -z "$RUNCVM_CPUS" ] || [ "$RUNCVM_CPUS" -le 0 ]; then RUNCVM_CPUS=$($RUNCVM_GUEST/bin/busybox nproc) fi # SAVE ENVIRONMENT export -n SHLVL OLDPWD export >/.runcvm/config # NOW LOAD DEFAULT ENV AND PATH . $RUNCVM_GUEST/scripts/runcvm-ctr-defaults # LOAD IP MANIPULATION FUNCTIONS . $RUNCVM_GUEST/scripts/runcvm-ip-functions # SAVE PWD busybox pwd >/.runcvm/pwd # DEBUG if [[ "$RUNCVM_BREAK" =~ prenet ]]; then bash; fi # SAVE NETWORKING CONFIG AND CONFIGURE BRIDGES # Identify default gateway device and IP address read -r DOCKER_GW_IF DOCKER_GW_IF_IP < \ <(ip -json route show | jq -r '.[] | (select(.dst == "default") | [.dev, .gateway]) | @tsv') # e.g. eth0 172.25.10.1 QEMU_BRIDGE_IP=169.254.1.1 RUNCVM_DNS_IP=169.254.169.254 mkdir -p /.runcvm/network/devices # Save non-link-scope non-default routes for later restoration in the running VM. ip -json route show | jq -r '.[] | select(.scope != "link" and .dst != "default") | "\(.dst) \(.gateway) \(.dev) \(.prefsrc)"' >/.runcvm/network/routes for if in $(ip -json link show | jq -r '.[] | .ifname') do [ "$if" = "lo" ] && continue read -r DOCKER_IF_IP DOCKER_IF_IP_NETPREFIX DOCKER_IF_MAC DOCKER_IF_MTU < \ <(ip -json addr show "$if" | jq -r '.[0] | [.addr_info[0].local, .addr_info[0].prefixlen, .address, .mtu] | @tsv') # e.g. 
172.25.10.2 24 52:54:00:b7:0b:b6 1500 # Save container network parameters if [ "$if" = "$DOCKER_GW_IF" ]; then printf "%s %s %s %s %s %s\n" \ "$if" "$DOCKER_IF_MAC" "$DOCKER_IF_MTU" "$DOCKER_IF_IP" "$DOCKER_IF_IP_NETPREFIX" "$DOCKER_GW_IF_IP" \ >/.runcvm/network/devices/$if ln -s "$if" /.runcvm/network/devices/default else printf "%s %s %s %s %s %s\n" \ "$if" "$DOCKER_IF_MAC" "$DOCKER_IF_MTU" "$DOCKER_IF_IP" "$DOCKER_IF_IP_NETPREFIX" "-" \ >/.runcvm/network/devices/$if fi # RECONFIGURE CONTAINER NETWORK ip addr flush dev "$if" QEMU_BRIDGE="br-$if" # Create the container bridge # See https://bugs.launchpad.net/neutron/+bug/1738659 ip link add "$QEMU_BRIDGE" type bridge forward_delay 0 ageing 0 # Add the original container interface to the bridge and bring it up. ip link set dev "$if" master "$QEMU_BRIDGE" ip link set dev "$if" up # Bring the bridge up. ip link set dev "$QEMU_BRIDGE" up # Restore network route via this bridge DOCKER_NET=$(ip_prefix_to_network "$DOCKER_IF_IP" "$DOCKER_IF_IP_NETPREFIX")/"$DOCKER_IF_IP_NETPREFIX" ip route add "$DOCKER_NET" dev "$QEMU_BRIDGE" # If this interface is the default gateway interface, perform additional special steps. if [ "$if" = "$DOCKER_GW_IF" ]; then # Add a private IP to this bridge. # We need it so the bridge can receive traffic, but the IP won't ever see the light of day. ip addr add "$QEMU_BRIDGE_IP" dev "$QEMU_BRIDGE" # Restore default gateway route via this bridge. ip route add default via "$DOCKER_GW_IF_IP" dev "$QEMU_BRIDGE" # Accept DNS requests for $RUNCVM_DNS_IP; these will be passed to dnsmasq XTABLES_LIBDIR=$RUNCVM_GUEST/usr/lib/xtables xtables-nft-multi iptables -t nat -A PREROUTING -d "$RUNCVM_DNS_IP/32" -p udp -m udp --dport 53 -j REDIRECT # Accept VNC requests on published port (5900 + display number) if [[ "$RUNCVM_DISPLAY_MODE" = "vnc" ]] || is_natural_int "$RUNCVM_QEMU_VNC_DISPLAY"; then XTABLES_LIBDIR=$RUNCVM_GUEST/usr/lib/xtables xtables-nft-multi iptables -t nat -A PREROUTING -p tcp -m tcp --dport $((RUNCVM_QEMU_VNC_DISPLAY+5900)) -j REDIRECT fi # Match UDP port 53 traffic, outgoing via the QEMU bridge, from the bridge's own IP: # -> Masquerade as if from the VM's IP. # This allows outgoing DNS requests from the VM to be received by dnsmasq running in the container. XTABLES_LIBDIR=$RUNCVM_GUEST/usr/lib/xtables xtables-nft-multi iptables -t nat -A POSTROUTING -o "$QEMU_BRIDGE" -s "$QEMU_BRIDGE_IP/32" -p udp -m udp --sport 53 -j SNAT --to-source "$DOCKER_IF_IP" XTABLES_LIBDIR=$RUNCVM_GUEST/usr/lib/xtables xtables-nft-multi iptables -t nat -A POSTROUTING -o "$QEMU_BRIDGE" -s "$QEMU_BRIDGE_IP/32" -p udp -m udp --dport 53 -j SNAT --to-source "$DOCKER_IF_IP" # Match traffic on TCP port $SSHD_PORT, outgoing via the QEMU bridge, from the bridge's own IP: # -> Masquerade it as if from the DNS_IP. # This is necessary to allow SSH from within the container to the VM. 
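# e.g. (illustrative flow) a 'docker exec' invokes runcvm-ctr-exec, whose
# 'dbclient -p 22222 root@<VM IP>' connection leaves via the bridge with source address
# 169.254.1.1 ($QEMU_BRIDGE_IP); the rule below rewrites that source to 169.254.169.254
# ($RUNCVM_DNS_IP).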
XTABLES_LIBDIR=$RUNCVM_GUEST/usr/lib/xtables xtables-nft-multi iptables -t nat -A POSTROUTING -o "$QEMU_BRIDGE" -s "$QEMU_BRIDGE_IP/32" -p tcp -m tcp --dport "$SSHD_PORT" -j SNAT --to-source "$RUNCVM_DNS_IP" fi done # FIXME: Bind-mount /etc/resolv.conf as well as /vm/etc/resolv.conf to prevent them showing in 'docker diff' cat /vm/etc/resolv.conf >/etc/resolv.conf RESOLV_CONF_NEW=$(busybox sed -r "s/127.0.0.11/$RUNCVM_DNS_IP/" /vm/etc/resolv.conf) echo "$RESOLV_CONF_NEW" >/vm/etc/resolv.conf # LAUNCH DNSMASQ # It will receive local DNS requests (within the container, on 127.0.0.1) # and requests redirected locally (via the iptables PREROUTING REDIRECT rule) for $RUNCVM_DNS_IP. dnsmasq -u root --no-hosts # LAUNCH VIRTIOFSD $RUNCVM_GUEST/scripts/runcvm-ctr-virtiofsd & # DEBUG if [[ "$RUNCVM_BREAK" =~ postnet ]]; then bash; fi # LAUNCH INIT SUPERVISING QEMU # FIXME: Add -v to debug exec $RUNCVM_GUEST/sbin/runcvm-init -c $RUNCVM_GUEST/scripts/runcvm-ctr-qemu ================================================ FILE: runcvm-scripts/runcvm-ctr-exec ================================================ #!/.runcvm/guest/bin/bash -e # See https://qemu-project.gitlab.io/qemu/interop/qemu-ga-ref.html # Load original environment . /.runcvm/config # Load defaults and aliases . $RUNCVM_GUEST/scripts/runcvm-ctr-defaults env() { busybox env "$@" } to_bin() { # tab, LF, space, ', ", \ tr "\011\012\040\047\042\134" '\200\201\202\203\204\205' } # Expects: # - To be run as root # - To be given env vars # - To be given arguments # $1 :: # $2 # $3 # $4 # $(5...) command="$RUNCVM_GUEST/scripts/runcvm-vm-exec" uidgid="$1" cwd="$2" hasHome="$3" wantsTerminal="$4" shift 4 # Parse uidgid and construct args array for the call to $command within the VM: # $1 # $2 # $3 # $(4...) IFS=':' read -r uid gid additionalGids <<< "$uidgid" args=("$@") if [ ${#args[@]} -gt 0 ]; then args_bin=$(printf '%s\n' "${args[@]}" | to_bin) fi # If the HOME env var was not set either in the image, or via docker run, or via docker exec, # then set HOME to the requested user's default homedir. # # - See runcvm-ctr-entrypoint for full details of how/why hasHome is needed and HOME gets set. if [ "$hasHome" != "1" ]; then # Either this script needs to look up uid's HOME or else runcvm-vm-exec does; for now, we do it here. HOME=$(getent passwd "$uid" | cut -d':' -f6) fi # Clean RUNCVM env vars clean_env # N.B. Only exported env vars will be returned and sent mapfile -t env < <(env -u _ -u SHLVL -u PWD) if [ ${#env[@]} -gt 0 ]; then env_bin=$(printf '%s\n' "${env[@]}" | to_bin) fi if [ "$wantsTerminal" = "true" ]; then opts=(-t) fi retries=30 # 15 seconds delay=0 # Signal that extra time should be allowed for RunCVM VM, its init and its dropbear sshd to start after the above conditions are satisfied while ! [ -s /.runcvm/dropbear/key ] || ! load_network do if [ $retries -gt 0 ]; then retries=$((retries-1)) delay=1 sleep 0.5 continue fi echo "Error: RunCVM container not yet started" >&2 exit 1 done # If startup was detected, wait a few extra seconds for dropbear sshd to be ready if [ "$delay" -ne 0 ]; then sleep 2 fi exec $RUNCVM_GUEST/usr/bin/dbclient "${opts[@]}" -p $SSHD_PORT -y -y -i /.runcvm/dropbear/key root@$DOCKER_IF_IP "$command '$uidgid' '$(echo -n $cwd | to_bin)' '$args_bin' '$env_bin'" ================================================ FILE: runcvm-scripts/runcvm-ctr-exit ================================================ #!/.runcvm/guest/bin/bash # Load original environment . /.runcvm/config # Load defaults and aliases . 
$RUNCVM_GUEST/scripts/runcvm-ctr-defaults

# runcvm-init execs this script when it exits.
# It:
# - performs any post-VM tests.
# - retrieves any saved exit code.
# - resets terminal readline horizontal scroll
# - exits with exit code

if [ -f /.runcvm/exitcode ]; then read CODE </.runcvm/exitcode; fi

# Reset terminal readline horizontal scroll (re-enable automatic margins)
tput smam >/dev/null

exit ${CODE:-0}

================================================
FILE: runcvm-scripts/runcvm-ctr-qemu
================================================

#!/.runcvm/guest/bin/bash

# Exit on errors
set -o errexit -o pipefail

# Load original environment
. /.runcvm/config

# Load defaults
. $RUNCVM_GUEST/scripts/runcvm-ctr-defaults && unset PATH

QEMU_IFUP="$RUNCVM_GUEST/scripts/runcvm-ctr-qemu-ifup"
QEMU_IFDOWN="$RUNCVM_GUEST/scripts/runcvm-ctr-qemu-ifdown"
INIT="init=$RUNCVM_GUEST/scripts/runcvm-vm-init"

# Must export TERMINFO so curses library can find terminfo database.
export TERMINFO="$RUNCVM_GUEST/usr/share/terminfo"

error() {
  echo "$1" >&2
  exit 1
}

# Argument e.g. /volume/disk1,/var/lib/docker,ext4,5G
do_disk() {
  local spec="$1"
  local id="$2"
  local src dst fs size dir UUID queues
  local IFS=','

  read src dst fs size <<< $(echo "$spec")

  if [[ -z "$src" || -z "$dst" || -z "$fs" ]]; then
    error "Error: disk spec '$spec' invalid: src, dst and fs must all be specified"
  fi

  if [[ "$src" = "$dst" ]]; then
    error "Error: disk spec '$spec' invalid: src '$src' cannot be same as dst"
  fi

  if [[ -e "$src" && ! -f "$src" ]]; then
    error "Error: disk spec '$spec' invalid: src '$src' must be a plain file if it exists"
  fi

  if [[ -e "$dst" && ! -d "$dst" ]]; then
    error "Error: disk spec '$spec' invalid: dst '$dst' must be a directory if it exists"
  fi

  if [[ ! -f "$src" ]]; then
    if [[ -z "$size" ]]; then
      error "Error: disk spec '$spec' invalid: size must be specified if src '$src' does not exist"
    fi

    # Create directory for disk backing file, if needed.
    dir="$(busybox dirname "$src")"
    if ! [ -d "$dir" ]; then
      mkdir -p "$dir"
    fi

    # Create disk backing file.
    busybox truncate -s "$size" "$src" >&2 || error "Error: disk spec '$spec' invalid: truncate on '$src' with size '$size' failed"

    # Create filesystem on disk backing file, populated with any pre-existing files from dst.
    [ -d "$RUNCVM_VM_MOUNTPOINT/$dst" ] || mkdir -p "$RUNCVM_VM_MOUNTPOINT/$dst" >&2
    mke2fs -q -F -t "$fs" -d "$RUNCVM_VM_MOUNTPOINT/$dst" "$src" >&2 || error "Error: disk spec '$spec' invalid: mke2fs on '$src' with fs '$fs' failed"
  fi

  # Create the mountpoint, if we haven't already/it didn't already exist.
  [ -d "$RUNCVM_VM_MOUNTPOINT/$dst" ] || mkdir -p "$RUNCVM_VM_MOUNTPOINT/$dst" >&2

  # Obtain a UUID for the filesystem and add to fstab.
  read -r UUID <<< $(blkid -o value "$src")
  echo "UUID=$UUID $dst $fs defaults,noatime 0 0" >>/.runcvm/fstab

  # Add disk to QEMU command line.
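  # num-queues is matched to the vCPU count but capped at 8; e.g. RUNCVM_CPUS=4
  # gives num-queues=4, while RUNCVM_CPUS=16 gives num-queues=8.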
if [ "${RUNCVM_CPUS:-1}" -gt 8 ]; then queues=8 else queues="${RUNCVM_CPUS:-1}" fi DISKS+=( # Create a block backend (no implicit frontend device attached) # -drive id=drv$id : give this backend a unique identifier # file=$src : source image file # if=none : do not attach directly, will be bound later with -device # format=raw : treat the file as raw block device # media=disk : marks this as a disk, not a CD-ROM # cache=none : use O_DIRECT, bypass host page cache, rely on guest flushes for durability # aio=io_uring : use Linux io_uring for asynchronous I/O (lower overhead than libaio) # discard=unmap : propagate guest TRIM/UNMAP to host (space reclamation) # detect-zeroes=unmap : convert guest zero writes into UNMAPs for efficiency -drive id=drv$id,file=$src,if=none,format=raw,media=disk,cache=none,aio=io_uring,discard=unmap,detect-zeroes=unmap # Create a dedicated I/O thread (dataplane) to offload block I/O # -object iothread,id=ioth$id : defines an iothread with a unique identifier -object iothread,id=ioth$id # Attach a virtio-blk PCI frontend to the backend and iothread # -device virtio-blk-pci : create a virtio block device on PCI bus # drive=drv$id : connect to backend created above # iothread=ioth$id : process requests in dedicated I/O thread # queue-size=1024 : set depth of each virtqueue # num-queues=4 : create 4 queues, allowing parallel I/O across vCPUs -device virtio-blk-pci,drive=drv$id,iothread=ioth$id,queue-size=1024,num-queues=$queues ) } # Argument e.g. /disk1,/home,ext4,5G;/disk2,/var,ext4,1G do_disks() { local IFS=';' local disk local id=0 for disk in $1 do do_disk "$disk" "$id" id=$((id+1)) done } do_networks() { local id=0 ifpath if mac vhost local DOCKER_IF DOCKER_IF_MAC DOCKER_IF_MTU DOCKER_IF_IP DOCKER_IF_IP_NETPREFIX DOCKER_IF_IP_GW for ifpath in /.runcvm/network/devices/* do if=$(busybox basename "$ifpath") [ "$if" = "default" ] && continue load_network "$if" mac=$(busybox sed -r 's/^..:..:../52:54:00/' <<<$DOCKER_IF_MAC) if [ "$RUNCVM_QEMU_NET_VHOST" = "1" ]; then vhost="on" else vhost="off" fi IFACES+=( -netdev tap,id=qemu$id,ifname=tap-$DOCKER_IF,script=$QEMU_IFUP,downscript=$QEMU_IFDOWN,vhost=$vhost -device virtio-net-pci,netdev=qemu$id,mac=$mac,rombar=$id ) id=$((id+1)) done } DISKS=() if [ -n "$RUNCVM_DISKS" ]; then do_disks "$RUNCVM_DISKS" fi IFACES=() do_networks if [ -n "$RUNCVM_TMPFS" ]; then echo "$RUNCVM_TMPFS" >>/.runcvm/fstab fi if [[ -z "$RUNCVM_CPUS" || "$RUNCVM_CPUS" -le 0 ]]; then RUNCVM_CPUS=$(busybox nproc) fi # TODO: # - Consider using '-device pvpanic' if [ "$RUNCVM_QEMU_ARCH" = "arm64" ]; then CMD="$(which qemu-system-aarch64)" MACHINE+=(-cpu max -machine virt,gic-version=max,usb=off) else CMD="$(which qemu-system-x86_64)" MACHINE+=(-enable-kvm) MACHINE+=(-cpu host,pmu=off) # QEMU machine configuration: # - q35: modern Intel chipset with PCIe and UEFI support # - accel=kvm: enable KVM for near-native performance (requires hardware virtualization) # - usb=on: enable USB support (e.g., for peripherals or boot devices) # - sata=off: disable SATA controller if using virtio/NVMe instead # - vmport=off: disable VMware backdoor port for improved security/stealth # - smm=on: enable System Management Mode (required for UEFI Secure Boot) # - dump-guest-core=off: disable saving core dumps on guest crash (saves disk space) # - hpet=off: disable High Precision Event Timer to avoid timing issues in some guests # - kernel_irqchip=on: enable kernel-based interrupt controller for better performance # - usb=on: enable USB support when requested if [ 
"$RUNCVM_QEMU_USB" = "on" ]; then USB="on" else USB="off" fi MACHINE+=(-machine q35,accel=kvm,sata=off,vmport=off,smm=on,dump-guest-core=off,hpet=off,usb=$USB,kernel_irqchip=on) # Enable the QEMU debug exit device for debugging purposes. # This device allows the guest to trigger a debug exit, which can be useful for debugging MACHINE+=(-device isa-debug-exit) # Set the virtual hardware clock to UTC time and enable drift correction using slewing. # This ensures the VM clock starts in UTC and gradually adjusts for time drift, # providing smooth and accurate timekeeping without sudden changes. MACHINE+=(-rtc base=utc,driftfix=slew) fi # RUNCVM_QEMU_DISPLAY -- the display frontend: none, curses # RUNCVM_QEMU_VGA -- the backend VGA video device: none, std, virtio, cirrus # RUNCVM_QEMU_VNC_DISPLAY -- the VNC display number (where enabled): 0, 1, ... CONSOLE=() DISPLAY=() case "${RUNCVM_DISPLAY_MODE:-headless}" in headless) # virtconsole / hvc0 RUNCVM_QEMU_DISPLAY="${RUNCVM_QEMU_DISPLAY:-none}" RUNCVM_QEMU_VGA="${RUNCVM_QEMU_VGA:-none}" # Creates a stdio backend connected to the virtual console. # Use with /dev/hvc0 CONSOLE+=( -chardev stdio,id=char0,mux=off,signal=off -device virtconsole,chardev=char0,id=console0 ) CONSOLE_DEV="hvc0" NOMODESET="1" ;; serial) # stdio + monitor / ttyS0 RUNCVM_QEMU_DISPLAY="${RUNCVM_QEMU_DISPLAY:-none}" RUNCVM_QEMU_VGA="${RUNCVM_QEMU_VGA:-none}" # Creates a multiplexed stdio backend connected to the serial port (and the QEMU monitor). # Use with /dev/ttyS0 CONSOLE+=( -chardev stdio,id=char0,mux=on,signal=off -serial chardev:char0 -mon chardev=char0 ) # Set monitor escape key to CTRL-T to reduce risk of conflict (as default, CTRL-A, is commonly used) CONSOLE+=(-echr 20) CONSOLE_DEV="ttyS0" NOMODESET="1" ;; vnc) # VNC + tty0 RUNCVM_QEMU_DISPLAY="${RUNCVM_QEMU_DISPLAY:-none}" RUNCVM_QEMU_VGA="${RUNCVM_QEMU_VGA:-virtio}" RUNCVM_QEMU_VNC_DISPLAY="${RUNCVM_QEMU_VNC_DISPLAY:-0}" CONSOLE_DEV="tty0" NOMODESET="0" ;; *) error "Error: RUNCVM_DISPLAY_MODE '$RUNCVM_DISPLAY_MODE' invalid, must be one of: default|headless, serial, vnc" ;; esac # Save choice of console device echo "$CONSOLE_DEV" >/.runcvm/console # Generate QEMU display options if [ -n "$RUNCVM_QEMU_DISPLAY" ]; then DISPLAY+=(-display $RUNCVM_QEMU_DISPLAY) fi if [ -n "$RUNCVM_QEMU_VGA" ]; then # i.e. virtio, std, none DISPLAY+=(-vga $RUNCVM_QEMU_VGA) if [ "$RUNCVM_QEMU_VGA" != "none" ]; then NOMODESET="0" # Disable nomodeset if using a graphical console DISPLAY+=(-device virtio-tablet-pci) DISPLAY+=(-audiodev none,id=snd0 -device intel-hda -device hda-output,audiodev=snd0) # For now, unless/until `-device virtio-gpu-pci` support is provided, # the VNC display is only relevant if VGA is not 'none' if [ -n "$RUNCVM_QEMU_VNC_DISPLAY" ]; then DISPLAY+=(-vnc :$RUNCVM_QEMU_VNC_DISPLAY,password=off) fi fi fi # Append nomodeset unless using a graphical console (i.e. not serial or virtconsole) # PROBABLY ONLY NEEDED FOR `-display curses` (RUNCVM_QEMU_DISPLAY=curses) CASE if [ "$NOMODESET" = "1" ]; then APPEND+=(nomodeset) fi if [ "$RUNCVM_BIOS_DEBUG" != "1" ]; then # Disable SeaBIOS serial console. 
# This -cfw_cfg path is modified from the SeaBIOS default (to avoid an otherwise-inevitable QEMU # warning being emitted) and so requires patched bios.bin file(s) (see Dockerfile) OPTS+=(-fw_cfg opt/org.seabios/etc/sercon-port,string=0) fi if [ "$RUNCVM_BIOS" = "EFI" ]; then # UEFI firmware OPTS+=( # -drive if=pflash,format=raw,unit=0,readonly=on,file=$RUNCVM_GUEST/usr/share/OVMF/OVMF_CODE.fd # -drive if=pflash,format=raw,unit=1,readonly=on,file=$RUNCVM_GUEST/usr/share/OVMF/OVMF_VARS.fd -bios $RUNCVM_GUEST/usr/share/OVMF/OVMF.fd ) fi MEM_BACKEND=(-numa node,memdev=mem) if [ "$RUNCVM_HUGETLB" != "1" ]; then # Tests suggests prealloc=on slows down mem-path=/dev/shm MEM_PATH="/dev/shm" MEM_BACKEND+=(-object memory-backend-file,id=mem,size=$RUNCVM_MEM_SIZE,mem-path=$MEM_PATH,share=on,prealloc=${RUNCVM_QEMU_MEM_PREALLOC:-off}) else # Fastest performance: +15% CPU/net intensive; 3.5x disk intensive. MEM_BACKEND+=(-object memory-backend-memfd,id=mem,size=$RUNCVM_MEM_SIZE,share=on,prealloc=${RUNCVM_QEMU_MEM_PREALLOC:-off},hugetlb=on) fi # 16-64 works well and is more performant than 1024 in some scenarios. # For now, stick with original figure. VIRTIOFS_QUEUE_SIZE=1024 VIRTIOFS+=( -chardev socket,id=virtiofs,path=$QEMU_VIRTIOFSD_SOCKET -device vhost-user-fs-pci,queue-size=$VIRTIOFS_QUEUE_SIZE,chardev=virtiofs,tag=runcvmfs,ats=off ) # Experimental: Enable to specify a dedicated PCI bridge # OPTS+=(-device pci-bridge,bus=pcie.0,id=pci-bridge-0,chassis_nr=1,shpc=off,addr=2,io-reserve=4k,mem-reserve=1m,pref64-reserve=1m) # Experimental: Enable for a SCSI bus # OPTS+=(-device virtio-scsi-pci,id=scsi0,disable-modern=true) # Disable IPv6, which is currently unsupported, at kernel boot time APPEND+=(ipv6.disable=1 panic=-1) # Disable unneeded functionality APPEND+=(scsi_mod.scan=none tsc=reliable no_timer_check rcupdate.rcu_expedited=1 i8042.direct=1 i8042.dumbkbd=1 i8042.nopnp=1 i8042.noaux=1 noreplace-smp reboot=k cryptomgr.notests pci=lastbus=0 selinux=0) # Enable systemd startup logging by default: # - Only effective when --env=RUNCVM_KERNEL_DEBUG=1 # - Override this by launching with --env='RUNCVM_KERNEL_APPEND=systemd.show_status=0' APPEND+=(systemd.show_status=1) if [ "$RUNCVM_KERNEL_DEBUG" = "1" ]; then APPEND+=(console=$CONSOLE_DEV) else APPEND+=(quiet) fi ARGS=( -no-user-config -nodefaults -no-reboot -action panic=none -action reboot=shutdown "${MACHINE[@]}" "${DISPLAY[@]}" "${OPTS[@]}" # N.B. There is a counterintuitive relationship between cpus and memory, and performance: # - more cpus needs more memory to maintain the same virtiofs disk I/O performance. 
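  # (Purely illustrative: an 8-vCPU guest may need more than the 2G that lets a
  #  2-vCPU guest sustain a given virtiofs throughput.)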
-m "$RUNCVM_MEM_SIZE" -smp $RUNCVM_CPUS,cores=1,threads=1,sockets=$RUNCVM_CPUS,maxcpus=$RUNCVM_CPUS # Creates a virtio-serial bus on the PCI bus; this is used for the guest agent and virtiofs -device virtio-serial-pci,id=serial0 # Creates an RNG on the PCI bus -object rng-random,id=rng0,filename=/dev/urandom -device virtio-rng-pci,rng=rng0 # Memory backend "${MEM_BACKEND[@]}" # virtiofs socket and interface "${VIRTIOFS[@]}" # Configure host/container tap device with PXE roms disabled "${IFACES[@]}" "${DISKS[@]}" # Configure console "${CONSOLE[@]}" # Support for guest agent -chardev socket,id=qemuguest0,path=$QEMU_GUEST_AGENT,server=on,wait=off -device virtserialport,chardev=qemuguest0,name=org.qemu.guest_agent.0 # Creates a unix socket for the QEMU monitor -monitor unix:$QEMU_MONITOR_SOCKET,server,nowait # Kernel and initrd and kernel cmdline -kernel $RUNCVM_KERNEL_PATH -initrd $RUNCVM_KERNEL_INITRAMFS_PATH -L $RUNCVM_GUEST/usr/share/qemu -append "$RUNCVM_KERNEL_ROOT $INIT rw ${APPEND[*]} $RUNCVM_KERNEL_APPEND" ) if [[ "$RUNCVM_BREAK" =~ preqemu ]]; then echo "Preparing to run: '$CMD' ${ARGS[@]@Q}"; bash; fi if [ "$RUNCVM_QEMU_DEBUG" = "1" ]; then "$CMD" "${ARGS[@]}" || echo "QEMU exited with code: $?" >&2 echo "QEMU memory cgroup events:" >&2 cat /sys/fs/cgroup/memory.events >&2 else exec "$CMD" "${ARGS[@]}" fi ================================================ FILE: runcvm-scripts/runcvm-ctr-qemu-ifdown ================================================ #!/.runcvm/guest/bin/bash # Load original environment . /.runcvm/config # Load defaults and aliases . $RUNCVM_GUEST/scripts/runcvm-ctr-defaults ip link set dev "$1" down || true exit 0 ================================================ FILE: runcvm-scripts/runcvm-ctr-qemu-ifup ================================================ #!/.runcvm/guest/bin/bash # Load original environment . /.runcvm/config # Load defaults and aliases . $RUNCVM_GUEST/scripts/runcvm-ctr-defaults tap="$1" if="$(busybox sed 's/tap-//' <<<$tap)" bri="$(busybox sed 's/tap-/br-/' <<<$tap)" load_network "$if" ip link set dev "$tap" up mtu "${DOCKER_IF_MTU:=1500}" master "$bri" exit 0 ================================================ FILE: runcvm-scripts/runcvm-ctr-qemu-poweroff ================================================ #!/.runcvm/guest/bin/bash # Load original environment . /.runcvm/config # Load defaults and aliases . $RUNCVM_GUEST/scripts/runcvm-ctr-defaults echo "system_powerdown" | nc -w 1 -U $QEMU_MONITOR_SOCKET ================================================ FILE: runcvm-scripts/runcvm-ctr-shutdown ================================================ #!/.runcvm/guest/bin/bash # runcvm-init forks and execs this script when it receives a SIGTERM # Load original environment . /.runcvm/config poweroff() { # Try ACPI poweroff $RUNCVM_GUEST/scripts/runcvm-ctr-qemu-poweroff # Try running busybox poweroff $RUNCVM_GUEST/scripts/runcvm-ctr-exec 0:0 / 0 0 $RUNCVM_GUEST/bin/poweroff &>/dev/null # Try killing the VM's PID 1 $RUNCVM_GUEST/scripts/runcvm-ctr-exec 0:0 / 0 0 $RUNCVM_GUEST/bin/busybox kill 1 &>/dev/null } poweroff exit 0 ================================================ FILE: runcvm-scripts/runcvm-ctr-virtiofsd ================================================ #!/.runcvm/guest/bin/bash # Load defaults and aliases . 
/.runcvm/guest/scripts/runcvm-ctr-defaults

if [ "$RUNCVM_SYS_ADMIN" = "1" ]; then
  OPTS+=(-o modcaps=+sys_admin)
fi

VIRTIOFSD_CACHE=${RUNCVM_VIRTIOFSD_CACHE:-auto}
OPTS+=(-o "cache=$VIRTIOFSD_CACHE")

# Send logs to /run in container (not in VM)
exec "$(which virtiofsd)" "${OPTS[@]}" -o announce_submounts -o xattr --socket-path=$QEMU_VIRTIOFSD_SOCKET -o source=$RUNCVM_VM_MOUNTPOINT -o sandbox=chroot >/run/.virtiofsd.log 2>&1

================================================
FILE: runcvm-scripts/runcvm-install-runtime.sh
================================================

#!/bin/sh

RUNCVM=/opt/runcvm
RUNCVM_LD=$RUNCVM/lib/ld
RUNCVM_JQ=$RUNCVM/usr/bin/jq
MNT=/runcvm
REPO=${REPO:-newsnowlabs/runcvm}

log() {
  echo "$@"
}

jq() {
  $RUNCVM_LD $RUNCVM_JQ "$@"
}

jq_set() {
  local file="$1"
  shift

  local tmp="/tmp/$$.json"
  if jq "$@" $file >$tmp; then
    mv $tmp $file
  else
    echo "Failed to update $(basename $file); aborting!" >&2
    exit 1
  fi
}

jq_get() {
  local file="$1"
  shift

  jq -r "$@" $file
}

usage() {
  cat <<_EOE_ >&2
Usage: sudo $0
_EOE_
  exit 1
}

check_rp_filter() {
  # For RunCVM to work, the following condition on /proc/sys/net/ipv4/conf/ must be met:
  # - the max of all/rp_filter and <bridge>/rp_filter should be 0 or 2
  #   (where <bridge> is the bridge underpinning the Docker network to which RunCVM instances will be attached)
  #
  # This means that:
  # - if all/rp_filter is set to 0, then <bridge>/rp_filter must be set to 0 or 2
  #   (or, if <bridge> is not yet or might not yet have been created, then default/rp_filter must be set to 0 or 2)
  # - if all/rp_filter is set to 1, then <bridge>/rp_filter must be set to 2
  #   (or, if <bridge> is not yet or might not yet have been created, then default/rp_filter must be set to 2)
  # - if all/rp_filter is set to 2, then no further action is needed

  local rp_filter_all rp_filter_default

  log "- Checking rp_filter ..."

  if [ -f "/proc/sys/net/ipv4/conf/all/rp_filter" ]; then
    rp_filter_all=$(cat /proc/sys/net/ipv4/conf/all/rp_filter)
  else
    log " - Warning: could not find /proc/sys/net/ipv4/conf/all/rp_filter"
  fi

  if [ -f "/proc/sys/net/ipv4/conf/default/rp_filter" ]; then
    rp_filter_default=$(cat /proc/sys/net/ipv4/conf/default/rp_filter)
  else
    log " - Warning: could not find /proc/sys/net/ipv4/conf/default/rp_filter"
  fi

  if [ -z "$rp_filter_all" ] || [ -z "$rp_filter_default" ]; then
    return
  fi

  if [ "$rp_filter_all" = "2" ]; then
    log " - sys.net.ipv4.conf.all.rp_filter is set to 2; assuming no further action needed"
    return
  elif [ "$rp_filter_all" = "0" ] && [ "$rp_filter_default" = "0" ]; then
    log " - sys.net.ipv4.conf.all.rp_filter AND sys.net.ipv4.conf.default.rp_filter are set to 0; assuming no further action needed"
    return
  fi

  log " - sys.net.ipv4.conf.all.rp_filter is set to $rp_filter_all; fixing ..."
  log " - Setting sys.net.ipv4.conf.all.rp_filter and sys.net.ipv4.conf.default.rp_filter to 2 ..."
  echo 2 >/proc/sys/net/ipv4/conf/all/rp_filter
  echo 2 >/proc/sys/net/ipv4/conf/default/rp_filter

  log " - Patching /etc/sysctl.conf, /etc/sysctl.d/* to make these settings persist after reboot ..."
  find /etc/sysctl.conf /etc/sysctl.d -type f -exec sed -r -i 's/^([ ]*net.ipv4.conf.(all|default).rp_filter)=(1)$/# DISABLED BY RUNCVM\n# \1=\3\n# ADDED BY RUNCVM\n\1=2/' {} \;
}

docker_restart() {
  # docker_restart
  # - With systemd, run: systemctl restart docker
  # - On GitHub Codespaces, run: sudo killall dockerd && sudo /usr/local/share/docker-init.sh

  local cmd init

  init=$(ps -o comm,pid 1 | grep ' 1$' | awk '{print $1}')

  log " - Preparing to restart dockerd ..."
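  # 'ps -o comm,pid 1' prints PID 1's command name and pid (e.g. 'systemd 1'), so $init
  # selects between the systemd, sysvinit and docker-init (GitHub Codespaces) restart
  # mechanisms below.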
if [ "$init" = "systemd" ]; then log " - Detected systemd" cmd="systemctl restart docker" elif [ -x "/etc/init.d/docker" ]; then log " - Detected sysvinit" cmd="/etc/init.d/docker restart" elif [ "$init" = "docker-init" ]; then if [ -x "/usr/local/share/docker-init.sh" ]; then log " - Detected docker-init on GitHub Codespaces" cmd="killall dockerd && /usr/local/share/docker-init.sh" fi fi if [ -n "$cmd" ]; then log " - Preparing to run: $cmd" read -p " - Run this? (Y/n): " yesno if [ "$yesno" != "${yesno#[Yy]}" ] || [ -z "$yesno" ]; then log " - Restarting dockerd with: $cmd" sh -c "$cmd" 2>&1 | sed 's/^/ - /' # Wait for dockerd to restart log " - Waiting for dockerd to restart ..." while ! docker ps >/dev/null 2>&1; do sleep 0.5 done log " - Restarted dockerd successfully" else log " - Please restart dockerd manually in the usual manner for your system" fi else log " - Couldn't detect restart mechanism for dockerd, please restart manually in the usual manner for your system" fi } log log "RunCVM Runtime Installer" log "========================" log if [ $(id -u) -ne 0 ]; then log "- Error: $0 must be run as root. Please relaunch using sudo." usage fi for app in docker dockerd do if [ -z $(which docker) ]; then log "- Error: $0 currently requires the '$app' binary; please install it and try again" usage fi done if [ "$1" = "--no-dockerd" ]; then NO_DOCKERD="1" log "- Skipping dockerd check and docker-based package install due to '--no-dockerd'" shift else log "- Checking dockerd ..." if docker info >/dev/null 2>&1; then log " - Detected running dockerd" else log " - Error: dockerd not running; please start dockerd; aborting!" exit 1 fi fi # Install RunCVM package to $MNT if [ -z "$NO_DOCKERD" ]; then log "- Installing RunCVM package to $MNT ..." if docker run --rm -v /opt/runcvm:$MNT $REPO --quiet; then log "- Installed RunCVM package to /opt/runcvm" else log "- Failed to install RunCVM package to /opt/runcvm; aborting!" exit 1 fi fi if [ -d "/etc/docker" ]; then log "- Detected /etc/docker" if ! [ -f "/etc/docker/daemon.json" ]; then log " - Creating empty daemon.json" echo '{}' >/etc/docker/daemon.json fi if [ $(jq_get "/etc/docker/daemon.json" ".runtimes.runcvm.path") != "/opt/runcvm/scripts/runcvm-runtime" ]; then log " - Adding runcvm to daemon.json runtimes property ..." if jq_set "/etc/docker/daemon.json" '.runtimes.runcvm.path |= "/opt/runcvm/scripts/runcvm-runtime"'; then log " - Done" else log " - Failed: $!" exit 1 fi # Attempt restart of dockerd # (if dockerd not found, we'll just continue) docker_restart else log " - Valid runcvm property already found in daemon.json" fi if docker info 2>/dev/null | grep -q runcvm; then # if [ $(docker info --format '{{ json .Runtimes.runcvm }}') = "{"path":"/opt/runcvm/scripts/runcvm-runtime"}" ]; then log " - Verification of RunCVM runtime in Docker completed" else log " - Warning: could not verify RunCVM runtime in Docker; perhaps you need to restart Docker manually" fi else log "- No /etc/docker detected; your mileage with RunCVM without Docker may vary!" fi if [ -n "$(which podman)" ]; then log "- Detected podman binary" cat <<_EOE_ >&2 - To enable experimental RunCVM support for Podman, add the following to /etc/containers/containers.conf in the [engine.runtimes] section: runcvm = [ "/opt/runcvm/scripts/runcvm-runtime" ] _EOE_ fi # Check, correct and make persistent required rp_filter settings check_rp_filter log "- RunCVM installation/upgrade complete." 
log ================================================ FILE: runcvm-scripts/runcvm-ip-functions ================================================ #!/bin/bash cidr_to_int() { echo "$(( 0xffffffff ^ ((1 << (32 - $1)) - 1) ))" } int_to_ip() { local value="$1" echo "$(( ($1 >> 24) & 0xff )).$(( ($1 >> 16) & 0xff )).$(( ($1 >> 8) & 0xff )).$(( $1 & 0xff ))" } cidr_to_netmask() { local value=$(cidr_to_int "$1") int_to_ip "$value" } ip_prefix_to_network() { local IFS i1 i2 i3 i4 m1 m2 m3 m4 IFS=. read -r i1 i2 i3 i4 <<< "$1" local mask=$(cidr_to_netmask "$2") IFS=. read -r m1 m2 m3 m4 <<< "$mask" printf "%d.%d.%d.%d\n" "$((i1 & m1))" "$((i2 & m2))" "$((i3 & m3))" "$((i4 & m4))" } cidr_to_bcastmask() { local value=$(( (1 << 32) - $(cidr_to_int "$1") - 1 )) int_to_ip "$value" } ip_prefix_to_bcast() { local IFS i1 i2 i3 i4 m1 m2 m3 m4 IFS=. read -r i1 i2 i3 i4 <<< "$1" local mask=$(cidr_to_bcastmask "$2") IFS=. read -r m1 m2 m3 m4 <<< "$mask" printf "%d.%d.%d.%d\n" "$((i1 | m1))" "$((i2 | m2))" "$((i3 | m3))" "$((i4 | m4))" } is_natural_int() { case $1 in ''|*[!0-9]*) return 1 ;; # not numeric *) return 0 ;; # numeric and > 0 esac } ================================================ FILE: runcvm-scripts/runcvm-runtime ================================================ #!/opt/runcvm/lib/ld-musl-x86_64.so.1 /opt/runcvm/bin/bash # REFERENCES # Qemu: # - https://github.com/joshkunz/qemu-docker # - https://mergeboard.com/blog/2-qemu-microvm-docker/ # - https://github.com/BBVA/kvm # Virtiofs # - https://vmsplice.net/~stefan/virtio-fs_%20A%20Shared%20File%20System%20for%20Virtual%20Machines.pdf # - https://virtio-fs.gitlab.io/howto-qemu.html # - https://www.tauceti.blog/posts/qemu-kvm-share-host-directory-with-vm-with-virtio/ # Container config.json spec # - https://github.com/opencontainers/runtime-spec/ # - https://github.com/opencontainers/runtime-spec/blob/main/config.md # Mount namespaces # - https://www.kernel.org/doc/Documentation/filesystems/sharedsubtree.txt # - https://www.redhat.com/sysadmin/mount-namespaces RUNCVM=/opt/runcvm RUNCVM_LD=$RUNCVM/lib/ld RUNCVM_JQ=$RUNCVM/usr/bin/jq RUNCVM_VM_MOUNTPOINT="/vm" RUNCVM_GUEST=/.runcvm/guest RUNCVM_ENTRYPOINT=$RUNCVM_GUEST/scripts/runcvm-ctr-entrypoint RUNCVM_EXEC="$RUNCVM_GUEST/scripts/runcvm-ctr-exec" RUNCVM_KERNELS=$RUNCVM/kernels RUNCVM_GUEST_KERNELS=$RUNCVM_GUEST/kernels RUNCVM_KERNEL_DEFAULT=debian RUNCVM_MEM_SIZE_DEFAULT="512" # Default VM memory size (Mb) if not specified in container config RUNCVM_CONTAINER_MEM_OVERHEAD="256" # Memory (Mb) overhead to allow for QEMU, virtiofsd, dnsmasq and other container memory footprint: 128Mb minimum, 256Mb recommended RUNCVM_DEBUG="" debug() { [ -n "$RUNCVM_DEBUG" ] && true || false } log() { debug && echo "$(date '+%Y-%m-%d %H:%M:%S.%6N'): $$: $@" >>/tmp/runcvm-$$.log } error() { # Skip past any docker error ending in CR (echo; echo) >&2 # Dump message to stderr echo "RunCVM: Error: $1" >&2 # Dump error also to the logfile log "RunCVM: Error: $1" exit -1 } load_env_from_file() { local file="$1" local var="$2" # Return gracefully if no $file exists if ! 
[ -f "$file" ]; then return 0 fi while read LINE do local name="${LINE%%=*}" local value="${LINE#*=}" if [ "$name" != "$LINE" ] && [ "$value" != "$LINE" ] && [ "$name" = "$var" ]; then # We found variable $name: return it, removing any leading/trailing double quotes echo "$value" | sed 's/^"//;s/"$//' return 0 fi done <"$file" return 1 } jq() { $RUNCVM_LD $RUNCVM_JQ "$@" } jq_set() { local file="$1" shift local tmp="/tmp/config.json.$$" if jq "$@" $file >$tmp; then mv $tmp $file else echo "Failed to update $(basename $file); aborting!" 2>&1 exit 1 fi } jq_get() { local file="$1" shift jq -r "$@" $file } get_process_env() { local file="$1" local var="$2" local default="$3" local value value=$(jq_get "$file" --arg env "$var" '.env[] | select(match("^" + $env + "=")) | match("^" + $env + "=(.*)") | .captures[] | .string') [ -n "$value" ] && echo -n "$value" || echo -n "$default" } get_process_env_boolean() { local file="$1" local var="$2" local value value=$(jq_get "$file" --arg env "$var" '.env[] | select(match("^" + $env + "=")) | match("^" + $env + "=(.*)") | .captures[] | .string') [ -n "$value" ] && echo "1" || echo "0" } get_config_env() { local var="$1" local default="$2" local value value=$(jq_get "$CFG" --arg env "$var" '.process.env[] | select(match("^" + $env + "=")) | match("^" + $env + "=(.*)") | .captures[] | .string') [ -n "$value" ] && echo -n "$value" || echo -n "$default" } set_config_env() { local var="$1" local value="$2" jq_set "$CFG" --arg env "$var=$value" '.process.env |= (.+ [$env] | unique)' } # PARSE RUNC GLOBAL OPTIONS: # --debug enable debug logging # --log value set the log file to write runc logs to (default is '/dev/stderr') # --log-format value set the log format ('text' (default), or 'json') (default: "text") # --root value root directory for storage of container state (this should be located in tmpfs) (default: "/run/user/1000/runc") # --criu value path to the criu binary used for checkpoint and restore (default: "criu") # --systemd-cgroup enable systemd cgroup support, expects cgroupsPath to be of form "slice:prefix:name" for e.g. "system.slice:runc:434234" # --rootless value ignore cgroup permission errors ('true', 'false', or 'auto') (default: "auto") COMMAND_LINE=("$@") if debug; then log "Command line: $0 ${COMMAND_LINE[@]@Q}" fi while true do case "$1" in --debug|--systemd-cgroup) shift; continue; ;; --log|--log-format|--root|--criu|--rootless) shift; shift; continue; ;; --log=*|--log-format=*|--root=*|--criu=*|--rootless=*) shift; continue; ;; *) break; ;; esac done COMMAND="$1" shift if [ "$COMMAND" = "create" ]; then debug && log "Command: create" # USAGE: # runc create [command options] # # PARSE 'create' COMMAND OPTIONS # --bundle value, -b value path to the root of the bundle directory, defaults to the current directory # --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal # --pid-file value specify the file to write the process id to # --no-pivot do not use pivot root to jail process inside rootfs. This should be used whenever the rootfs is on top of a ramdisk # --no-new-keyring do not create a new session keyring for the container. 
This will cause the container to inherit the calling processes session key # --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) while true do case "$1" in --bundle|-b) shift; BUNDLE="$1"; shift; continue; ;; --console-socket|--pid-file|--preserve-fds) shift; shift; continue; ;; --no-pivot|--no-new-keyring) shift; continue; ;; *) break; ;; esac done ID="$1" CFG="$BUNDLE/config.json" ROOT=$(jq -r .root.path $CFG) # Allow user to enable debug logging if [ "$(get_config_env RUNCVM_RUNTIME_DEBUG)" = "1" ]; then RUNCVM_DEBUG="1" fi if debug; then log "Command line: $0 ${COMMAND_LINE[@]@Q}" log "Command: create bundle=$BUNDLE id=$ID root=$ROOT" # Save formatted config.json jq -r . <$CFG >/tmp/config.json-$$-1 fi # Pending support for user-specified mountpoint for the guest (VM) binaries and scripts set_config_env "RUNCVM_GUEST" "$RUNCVM_GUEST" ARG0=$(jq_get "$CFG" '.process.args[0]') # Now look in mounts for destination == $ARG0 (this works for Docker and Podman) if [ "$ARG0" = "/sbin/docker-init" ] || [ "$ARG0" = "/dev/init" ]; then # User intended an init process to be run in the container, # so arrange to run our own instead, that will launch the original entrypoint # Look for and remove a mountpoint for this process. jq_set "$CFG" --arg init "$ARG0" '(.mounts[] | select(.destination == $init)) |= empty' # Replace the first argument with our own entrypoint; and remove the second, '--' (for now, #TODO) jq_set "$CFG" --arg entrypoint "$RUNCVM_ENTRYPOINT" '.process.args[0] = $entrypoint | del(.process.args[1])' # We know the user intended an init process to be run in the container. # TODO: We might want to indicate this, so that our entrypoint does not skip doing this # if the original entrypoint also looks like an init process. set_config_env "RUNCVM_INIT" "1" else # We don't know if the original entrypoint is an init process or not. # Run our entrypoint first to work this out and do the right thing. jq_set "$CFG" --arg entrypoint "$RUNCVM_ENTRYPOINT" '.process.args |= [$entrypoint] + .' fi # SET RUNCVM_HAS_HOME # # If the HOME env var was not set either in the image, or via docker run, # then it will be missing in the config env. Detect this case for communication to runcvm-ctr-entrypoint # so that HOME can be set to the requested user's default homedir. # # - See runcvm-ctr-entrypoint for full details of how/why hasHome is needed and HOME gets set. if [ -n "$(get_config_env HOME)" ]; then set_config_env "RUNCVM_HAS_HOME" "1" else set_config_env "RUNCVM_HAS_HOME" "0" fi # CONFIGURE USER # - Must be root to run container RUNCVM_UIDGID=$(jq_get "$CFG" '(.process.user.uid | tostring) + ":" + (.process.user.gid | tostring) + ":" + ((.process.user.additionalGids // []) | join(","))') set_config_env "RUNCVM_UIDGID" "$RUNCVM_UIDGID" jq_set "$CFG" '.process.user = {"uid":0, "gid":0}' log "RUNCVM_UIDGID=$RUNCVM_UIDGID" # CONFIGURE CPUS RUNCVM_CPUS=$(( $(jq_get "$CFG" '.linux.resources.cpu.quota') / 100000)) set_config_env "RUNCVM_CPUS" "$RUNCVM_CPUS" log "RUNCVM_CPUS=$RUNCVM_CPUS" # CONFIGURE MOUNTS set_config_env "RUNCVM_VM_MOUNTPOINT" "$RUNCVM_VM_MOUNTPOINT" # First extract list of tmpfs mounts in fstab form, then delete them from the config RUNCVM_TMPFS=$(jq_get "$CFG" '( .mounts[] | select(.type == "tmpfs" and (.destination | test("^/dev(/|$)") | not) ) ) | [.source + " " + .destination + " tmpfs " + (.options | map(select(. != "rprivate" and . 
!= "private")) | join(",")) + " 0 0"] | .[0]') jq_set "$CFG" -r 'del( .mounts[] | select(.type == "tmpfs" and (.destination | test("^/dev(/|$)") | not) ) )' set_config_env "RUNCVM_TMPFS" "$RUNCVM_TMPFS" # Rewrite all pre-existing bind/volume mounts (except those at or below /disks) to mount # below $RUNCVM_VM_MOUNTPOINT instead of below /. # # TODO TO CONSIDER: # If we excluded /etc/(resolv.conf,hosts,hostname), and moved these to top of the array # (by promoting them at the end of the below statements), they would be present in both # container and VM. # # N.B. A mount at or underneath /disks will NOT be mapped to /vm/disks - this path is reserved for mounting disk files to the container jq_set "$CFG" --arg vm "$RUNCVM_VM_MOUNTPOINT" '( .mounts[] | select(.type == "bind" and (.destination | test("^/disks(/|$)") | not) ) ).destination |= $vm + .' # Mount / from container to $RUNCVM_VM_MOUNTPOINT, recursively binding all pre-existing mount points # (these being only the ones defined ahead of this item in the mounts[] array - so order matters!) jq_set "$CFG" --arg root "$ROOT" --arg vm "$RUNCVM_VM_MOUNTPOINT" '.mounts |= [{"destination":$vm,"type":"bind","source":$root,"options":["rbind","private","rw"]}] + .' # Mount /opt/runcvm from host to container # Define this at top of mounts[] so it is recursively mounted # and before (but after in the mounts[] array) /.runcvm so it can be mounted inside it jq_set "$CFG" --arg runcvm "$RUNCVM" --arg runcvm_guest "$RUNCVM_GUEST" '.mounts |= [{"destination":$runcvm_guest,"type":"bind","source":$runcvm,"options":["bind","private","ro"]}] + .' # Mount a tmpfs at /.runcvm in container # Define this at top of mounts[] so it is recursively mounted jq_set "$CFG" '.mounts |= [{"destination":"/.runcvm","type":"tmpfs","source":"runcvm","options":["nosuid","noexec","nodev","size=1M","mode=700"]}] + .' # Mount a tmpfs at /run in container # Define this at bottom of mounts[] so it is not recursively mounted to /vm jq_set "$CFG" '.mounts += [{"destination":"/run","type":"tmpfs","source":"run","options":["nosuid","noexec","nodev","size=1M","mode=700"]}]' # DETERMINE LAUNCH KERNEL: # # 1. If RUNCVM_KERNEL specified: # - or /latest - use latest RUNCVM kernel available for this dist *and* ARGS # - / - use specific RUNCVM kernel version for this dist *and* ARGS # 2. Else, check /etc/os-release and: # a. Use builtin kernel for this dist (if present in the expected location) *and* ARGS # b. Use latest RUNCVM kernel available for the dist: # - ID=alpine, VERSION_ID=3.16.0 => alpine/latest # - ID=debian, VERSION_ID=11 => debian/latest # - ID=ubuntu, VERSION_ID=22.04 => ubuntu/latest # Look for RUNCVM_KERNEL env var RUNCVM_KERNEL=$(get_config_env 'RUNCVM_KERNEL') log "RUNCVM_KERNEL='$RUNCVM_KERNEL' (1)" # Generate: # - RUNCVM_KERNEL_ID: the distro name (e.g. alpine, debian, ubuntu) # - RUNCVM_KERNEL_IDVER: the distro name and kernel version (e.g. alpine/5.15.59-0-virt, debian/5.10.0-16-amd64) if [ -n "$RUNCVM_KERNEL" ]; then # If found, validate if [[ "$RUNCVM_KERNEL" =~ \.\. ]]; then error "Kernel '$RUNCVM_KERNEL' invalid (contains '..')" fi if ! [[ "$RUNCVM_KERNEL" =~ ^[a-z]+(/[^/]+)?$ ]]; then error "Kernel '$RUNCVM_KERNEL' invalid (should match ^[a-z]+(/[^/]+)?$)" fi if ! 
[ -d "$RUNCVM_KERNELS/$RUNCVM_KERNEL" ]; then error "Kernel '$RUNCVM_KERNEL' not found (check $RUNCVM_KERNELS)" fi # If RUNCVM_KERNEL is a distro name only, append /latest if [[ "$RUNCVM_KERNEL" =~ ^[a-z]+$ ]]; then RUNCVM_KERNEL_IDVER="$RUNCVM_KERNEL/latest" else RUNCVM_KERNEL_IDVER="$RUNCVM_KERNEL" fi RUNCVM_KERNEL_ID=$(dirname "$RUNCVM_KERNEL_IDVER") # Returns e.g. alpine, debian, ubuntu else # If not found, look for value from /etc/os-release in the container image RUNCVM_KERNEL_ID=$(load_env_from_file "$ROOT/etc/os-release" "ID") # Currently unused # RUNCVM_KERNEL_OS_VERSION_ID=$(load_var_from_env "$ROOT/etc/os-release" "VERSION_ID") # If still not found, assign a default if [ -z "$RUNCVM_KERNEL_ID" ]; then RUNCVM_KERNEL_ID="${RUNCVM_KERNEL_DEFAULT:-debian}" fi RUNCVM_KERNEL_IDVER="$RUNCVM_KERNEL_ID/latest" fi log "RUNCVM_KERNEL='$RUNCVM_KERNEL' (2)" log "RUNCVM_KERNEL_ID='$RUNCVM_KERNEL_ID'" log "RUNCVM_KERNEL_IDVER='$RUNCVM_KERNEL_IDVER'" # Now look up the default kernel and initramfs paths and args for this kernel case "$RUNCVM_KERNEL_ID" in debian) RUNCVM_KERNEL_OS_KERNEL_PATH="/vmlinuz" RUNCVM_KERNEL_OS_INITRAMFS_PATH="/initrd.img" RUNCVM_KERNEL_ROOT="rootfstype=virtiofs root=runcvmfs noresume net.ifnames=1" ;; ubuntu) RUNCVM_KERNEL_OS_KERNEL_PATH="/boot/vmlinuz" RUNCVM_KERNEL_OS_INITRAMFS_PATH="/boot/initrd.img" RUNCVM_KERNEL_ROOT="rootfstype=virtiofs root=runcvmfs noresume net.ifnames=1" ;; ol) RUNCVM_KERNEL_OS_KERNEL_PATH="/boot/vmlinuz" RUNCVM_KERNEL_OS_INITRAMFS_PATH="/boot/initramfs" RUNCVM_KERNEL_ROOT="root=virtiofs:runcvmfs noresume net.ifnames=1" ;; alpine|openwrt) RUNCVM_KERNEL_OS_KERNEL_PATH="/boot/vmlinuz-virt" RUNCVM_KERNEL_OS_INITRAMFS_PATH="/boot/initramfs-virt" RUNCVM_KERNEL_ROOT="rootfstype=virtiofs root=runcvmfs resume=" ;; *) error "Unrecognised image O/S '$RUNCVM_KERNEL'; specify --env=RUNCVM_KERNEL= or --env=RUNCVM_KERNEL=/"; ;; esac # If no RUNCVM_KERNEL specified, look for a kernel and initramfs at the expected paths in the container image. if [[ -z "$RUNCVM_KERNEL" && -f "$ROOT/$RUNCVM_KERNEL_OS_KERNEL_PATH" && -f "$ROOT/$RUNCVM_KERNEL_OS_INITRAMFS_PATH" ]]; then RUNCVM_KERNEL_PATH="$RUNCVM_KERNEL_OS_KERNEL_PATH" RUNCVM_KERNEL_INITRAMFS_PATH="$RUNCVM_KERNEL_OS_INITRAMFS_PATH" else # If RUNCVM_KERNEL was specified, or we didn't find a kernel and initramfs at the expected paths in the container image, # select the latest RUNCVM kernel version and arrange to mount it. RUNCVM_KERNEL_VERSION=$(basename $(readlink -f "$RUNCVM_KERNELS/$RUNCVM_KERNEL_IDVER")) # Returns e.g. 
5.15.53-0-virt RUNCVM_KERNEL_MOUNT_LIB_MODULES=$(get_config_env 'RUNCVM_KERNEL_MOUNT_LIB_MODULES') if [ -n "$RUNCVM_KERNEL_MOUNT_LIB_MODULES" ]; then RUNCVM_KERNEL_MODULES_SRC="$RUNCVM_KERNELS/$RUNCVM_KERNEL_ID/$RUNCVM_KERNEL_VERSION/modules" RUNCVM_KERNEL_MODULES_DST="/lib/modules" else RUNCVM_KERNEL_MODULES_SRC="$RUNCVM_KERNELS/$RUNCVM_KERNEL_ID/$RUNCVM_KERNEL_VERSION/modules/$RUNCVM_KERNEL_VERSION" RUNCVM_KERNEL_MODULES_DST="/lib/modules/$RUNCVM_KERNEL_VERSION" fi RUNCVM_KERNEL_PATH="$RUNCVM_GUEST_KERNELS/$RUNCVM_KERNEL_ID/$RUNCVM_KERNEL_VERSION/vmlinuz" RUNCVM_KERNEL_INITRAMFS_PATH="$RUNCVM_GUEST_KERNELS/$RUNCVM_KERNEL_ID/$RUNCVM_KERNEL_VERSION/initrd" jq_set "$CFG" --arg modules_dst "$RUNCVM_VM_MOUNTPOINT$RUNCVM_KERNEL_MODULES_DST" --arg modules_src "$RUNCVM_KERNEL_MODULES_SRC" '.mounts += [{"destination":$modules_dst,"type":"bind","source":$modules_src,"options":["bind","private","ro"]}]' fi log "RUNCVM_KERNEL='$RUNCVM_KERNEL'" log "RUNCVM_KERNEL_ID='$RUNCVM_KERNEL_ID'" log "RUNCVM_KERNEL_VERSION='$RUNCVM_KERNEL_VERSION'" log "RUNCVM_KERNEL_OS_KERNEL_PATH='$RUNCVM_KERNEL_OS_KERNEL_PATH'" log "RUNCVM_KERNEL_OS_INITRAMFS_PATH='$RUNCVM_KERNEL_OS_INITRAMFS_PATH'" log "RUNCVM_KERNEL_PATH='$RUNCVM_KERNEL_PATH'" log "RUNCVM_KERNEL_INITRAMFS_PATH='$RUNCVM_KERNEL_INITRAMFS_PATH'" log "RUNCVM_KERNEL_ROOT='$RUNCVM_KERNEL_ROOT'" log "RUNCVM_KERNEL_MODULES_SRC='$RUNCVM_KERNEL_MODULES_SRC'" log "RUNCVM_KERNEL_MODULES_DST='$RUNCVM_KERNEL_MODULES_DST'" set_config_env "RUNCVM_KERNEL_PATH" "$RUNCVM_KERNEL_PATH" set_config_env "RUNCVM_KERNEL_INITRAMFS_PATH" "$RUNCVM_KERNEL_INITRAMFS_PATH" set_config_env "RUNCVM_KERNEL_ROOT" "$RUNCVM_KERNEL_ROOT" # Configure devices if [ "$(get_config_env 'RUNCVM_QEMU_NET_VHOST')" = "1" ]; then jq_set "$CFG" '.linux.resources.devices += [{"allow":true,"type":"c","major":10,"minor":232,"access":"rwm"},{"allow":true,"type":"c","major":10,"minor":200,"access":"rwm"},{"allow":true,"type":"c","major":10,"minor":238,"access":"rwm"}]' jq_set "$CFG" '.linux.devices+=[{"path":"/dev/net/tun","type":"c","major":10,"minor":200,"fileMode":8630,"uid":0,"gid":0},{"path":"/dev/kvm","type":"c","major":10,"minor":232,"fileMode":8630,"uid":0,"gid":0},{"path":"/dev/vhost-net","type":"c","major":10,"minor":238,"fileMode":8630,"uid":0,"gid":0}]' else jq_set "$CFG" '.linux.resources.devices += [{"allow":true,"type":"c","major":10,"minor":232,"access":"rwm"},{"allow":true,"type":"c","major":10,"minor":200,"access":"rwm"}]' jq_set "$CFG" '.linux.devices+=[{"path":"/dev/net/tun","type":"c","major":10,"minor":200,"fileMode":8630,"uid":0,"gid":0},{"path":"/dev/kvm","type":"c","major":10,"minor":232,"fileMode":8630,"uid":0,"gid":0}]' fi # For now, hardcode --security-opt=seccomp=unconfined; # later, we can work out the minimal seccomp permissions required. 
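# Deleting .linux.seccomp from config.json causes runc to install no seccomp filter at all,
# the same effect as running the container with 'docker run --security-opt seccomp=unconfined'.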
jq_set "$CFG" '.linux.seccomp |= empty' # CONFIGURE MEMORY # Set /dev/shm to RUNCVM_MEM_SIZE env var, or to default # - it should be large enough to support VM memory RUNCVM_MEM_LIMIT=$(jq_get "$CFG" '.linux.resources.memory.limit') log "RUNCVM_MEM_LIMIT=$RUNCVM_MEM_LIMIT" if [ "$RUNCVM_MEM_LIMIT" != "null" ]; then RUNCVM_MEM_SIZE="$(( $RUNCVM_MEM_LIMIT/1024/1024 ))" else RUNCVM_MEM_SIZE="$RUNCVM_MEM_SIZE_DEFAULT" fi log "RUNCVM_MEM_SIZE=${RUNCVM_MEM_SIZE}M" set_config_env "RUNCVM_MEM_SIZE" "${RUNCVM_MEM_SIZE}M" RUNCVM_HUGETLB=$(get_config_env "RUNCVM_HUGETLB") if [ "$RUNCVM_HUGETLB" != "1" ]; then jq_set "$CFG" --arg size "${RUNCVM_MEM_SIZE}M" '( .mounts[] | select(.destination == "/dev/shm") ) = {"destination": "/dev/shm","type": "tmpfs","source": "shm","options": ["nosuid","noexec","nodev","mode=1777","size=" + $size]}' # else # --shm-size applies; default 64m. fi # Set the container memory limit with a reasonable additionak reserve to allow for # QEMU, virtiofsd, dnsmasq and other container memory footprint. jq_set "$CFG" --arg size "$((($RUNCVM_MEM_SIZE + $RUNCVM_CONTAINER_MEM_OVERHEAD)*1024*1024))" '.linux.resources.memory.limit = ($size | tonumber)' # Add non-default capabilities needed by: # - Docker: CAP_NET_ADMIN # - Podman: CAP_NET_ADMIN, CAP_NET_RAW, CAP_MKNOD, CAP_AUDIT_WRITE for field in bounding effective permitted do jq_set "$CFG" --arg field "bounding" '.process.capabilities[$field] |= (.+ ["CAP_NET_ADMIN","CAP_NET_RAW","CAP_MKNOD","CAP_AUDIT_WRITE"] | unique)' done # Filter for RUNCVM_SYS_ADMIN=1 RUNCVM_SYS_ADMIN=$(get_config_env "RUNCVM_SYS_ADMIN") if [ "$RUNCVM_SYS_ADMIN" = "1" ]; then # TODO use 'unique' jq_set "$CFG" '.process.capabilities.bounding += ["CAP_SYS_ADMIN"] | .process.capabilities.effective += ["CAP_SYS_ADMIN"] | .process.capabilities.permitted += ["CAP_SYS_ADMIN"]' fi debug && cp -a $CFG /tmp/config.json-$$-2 elif [ "$COMMAND" = "exec" ]; then debug && log "Command: exec" # USAGE: # runc exec [command options] [command options] || -p process.json # # PARSE 'exec' COMMAND OPTIONS # --console-socket value path to an AF_UNIX socket which will receive a file descriptor referencing the master end of the console's pseudoterminal # --cwd value current working directory in the container # --env value, -e value set environment variables # --tty, -t allocate a pseudo-TTY # --user value, -u value UID (format: [:]) # --additional-gids value, -g value additional gids # --process value, -p value path to the process.json # --detach, -d detach from the container's process # --pid-file value specify the file to write the process id to # --process-label value set the asm process label for the process commonly used with selinux # --apparmor value set the apparmor profile for the process # --no-new-privs set the no new privileges value for the process # --cap value, -c value add a capability to the bounding set for the process # --preserve-fds value Pass N additional file descriptors to the container (stdio + $LISTEN_FDS + N in total) (default: 0) # --cgroup value run the process in an (existing) sub-cgroup(s). Format is [:]. 
# --ignore-paused allow exec in a paused container while true do case "$1" in --console-socket|--cwd|--env|-e|--user|-u|--additional-gids|-g|--pid-file|--process-label|--apparmor|--cap|-c|--preserve-fds|--cgroup) shift; shift; continue; ;; --tty|-t|--detach|-d|--no-new-privs|--ignore-paused) shift; continue; ;; --process|-p) shift; PROCESS="$1"; continue; ;; *) break; ;; esac done # Allow user to enable debug logging if [ "$(get_process_env "$PROCESS" 'RUNCVM_RUNTIME_DEBUG' '0')" = "1" ]; then RUNCVM_DEBUG="1" fi if debug; then log "Command line: $0 ${COMMAND_LINE[@]@Q}" log "Command: exec process=$PROCESS" # Save formatted process.json jq -r . <$PROCESS >/tmp/process.json-$$-1 fi ARG1=$(jq_get "$PROCESS" '.args[0]') if [ "$ARG1" = "---" ]; then jq_set "$PROCESS" 'del(.args[0])' else uidgid=$(jq_get "$PROCESS" '(.user.uid | tostring) + ":" + (.user.gid | tostring) + ":" + ((.user.additionalGids // []) | join(","))') cwd=$(jq_get "$PROCESS" '.cwd') hasHome=$(get_process_env_boolean "$PROCESS" 'HOME') wantsTerminal=$(jq_get "$PROCESS" '.terminal') jq_set "$PROCESS" \ --arg exec "$RUNCVM_EXEC" \ --arg uidgid "$uidgid" \ --arg cwd "$cwd" \ --arg hasHome "$hasHome" \ --arg wantsTerminal "$wantsTerminal" \ '.args |= [$exec, $uidgid, $cwd, $hasHome, $wantsTerminal] + .' # Force root (or whatever user qemu runs as) # Force cwd in the container to / jq_set "$PROCESS" '.user = {"uid":0, "gid":0} | .cwd="/"' fi debug && cp -a $PROCESS /tmp/process.json-$$-2 fi debug && log "--- LOG ENDS ---" exec /usr/bin/runc "${COMMAND_LINE[@]}" ================================================ FILE: runcvm-scripts/runcvm-vm-exec ================================================ #!/.runcvm/guest/bin/bash from_bin() { tr '\200\201\202\203\204\205' "\011\012\040\047\042\134" } error() { echo "OCI runtime exec failed: exec failed: unable to start container process: chdir to cwd (\"$cwd\") set in config.json failed: no such file or directory: unknown" exit 126 } uidgid="$1" cwd_bin="$2" shift 2 IFS=':' read -r uid gid additionalGids <<< "$uidgid" args_bin="$1" env_bin="$2" mapfile -t args < <(echo -n "$args_bin" | from_bin) mapfile -t env < <(echo -n "$env_bin" | from_bin) cwd=$(echo -n "$cwd_bin" | from_bin) cd "$cwd" 2>/dev/null && unset OLDPWD || error # Load original environment . /.runcvm/config # Load defaults and aliases . $RUNCVM_GUEST/scripts/runcvm-ctr-defaults exec -c $RUNCVM_GUEST/bin/busybox env -i "${env[@]}" $RUNCVM_GUEST/bin/s6-applyuidgid -u $uid -g $gid -G "$additionalGids" "${args[@]}" ================================================ FILE: runcvm-scripts/runcvm-vm-init ================================================ #!/.runcvm/guest/bin/bash -e # Load original environment . /.runcvm/config # Load defaults and aliases . $RUNCVM_GUEST/scripts/runcvm-ctr-defaults # Alpine initrd doesn't honour command-line rw flag mount -o remount,rw / # FIXME: Something is making /.runcvm ro, so remount it rw # until such time as exit code handling and dropbear key creation # obviate the need for this. mount -o remount,rw /.runcvm # Alpine initrd doesn't configure /dev device permissions and ownership # to support non-root users. 
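# ('findmnt -rnu -o FSTYPE /dev' prints just the filesystem type mounted at /dev, e.g. 'devtmpfs',
#  so the fixups below apply only when the initrd mounted a bare devtmpfs there.)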
if [ "$(findmnt -rnu -o FSTYPE /dev)" = "devtmpfs" ]; then [ -e /dev/stdin ] || ln -snf /proc/self/fd/0 /dev/stdin [ -e /dev/stdout ] || ln -snf /proc/self/fd/1 /dev/stdout [ -e /dev/stderr ] || ln -snf /proc/self/fd/2 /dev/stderr [ -e /proc/kcore ] && ln -snf /proc/kcore /dev/core [ -h /dev/ptmx ] || ln -snf pts/ptmx /dev/ptmx chmod 666 /dev/null /dev/random /dev/urandom /dev/zero /dev/tty /dev/pts/ptmx chmod 620 /dev/tty[0-9]* chgrp tty /dev/tty* fi # Unmount /run if it is a tmpfs (not a virtiofs) mounted by the initramfs # /run may be populated in the underlying image, and may also be a volume or be bind-mounted, # and its contents should be accessible in these cases. if [ "$(findmnt -rnu -o FSTYPE /run)" = "tmpfs" ]; then busybox umount -fl /run fi # FIXME: virtiofs mounts aren't always made rw. Remount them all rw (if allowed) # $RUNCVM_GUEST/bin/mount -t virtiofs | awk '{print $3}' | xargs -n 1 mount -o remount,rw # Some systems do not set up /dev/fd. If needed, add it. if ! [ -h /dev/fd ]; then ln -s /proc/self/fd /dev/fd fi # FIXME: This must be run early enough, otherwise other interfaces like docker0 might have started IF=$(ls /sys/class/net/ | grep -vE '^(lo|docker)' | head -n 1) # https://bugzilla.redhat.com/show_bug.cgi?id=501934 for i in all $IF do # /sbin/sysctl -q -w -e net.ipv6.conf.$i.disable_ipv6=1 net.ipv6.conf.$i.autoconf=0 net.ipv6.conf.$i.accept_ra=0 sysctl -q -w -e net.ipv6.conf.$i.disable_ipv6=1 net.ipv6.conf.$i.autoconf=0 || true done # Bring up local interface ip link set lo up # Identify each interface by MAC address, then give each a temporary name # (as we might ultimately need to rename e.g. eth0->eth1 and eth1->eth0). for ifpath in /.runcvm/network/devices/* do if=$(busybox basename "$ifpath") [ "$if" = "default" ] && continue load_network "$if" # Locate the actual network device by its MAC address. mac=$(busybox sed -r 's/^..:..:../52:54:00/' <<<$DOCKER_IF_MAC) device=$(ip -json link show | jq -r --arg mac "$mac" '.[] | select(.address == $mac) | .ifname') ip link set $device name $DOCKER_IF-tmp done # Configure, rename and bring up all interfaces. for ifpath in /.runcvm/network/devices/* do if=$(busybox basename "$ifpath") [ "$if" = "default" ] && continue load_network "$if" ip link set $DOCKER_IF-tmp name $DOCKER_IF ip addr add $DOCKER_IF_IP/$DOCKER_IF_IP_NETPREFIX broadcast + dev $DOCKER_IF ip link set $DOCKER_IF up mtu "${DOCKER_IF_MTU:=1500}" # If this is the default gateway interface, establish the default gateway [ -n "$DOCKER_IF_IP_GW" ] && ip route add default via $DOCKER_IF_IP_GW done # Read and install any supplementary routes. 
# Read and install any supplementary routes.
while read -r DOCKER_RT_NET DOCKER_RT_GW DOCKER_RT_DEV DOCKER_RT_PREFSRC
do
  [ -n "$DOCKER_RT_NET" ] && [ -n "$DOCKER_RT_GW" ] && [ -n "$DOCKER_RT_DEV" ] && \
    ip route add "$DOCKER_RT_NET" via "$DOCKER_RT_GW" dev "$DOCKER_RT_DEV" || true
done </.runcvm/network/routes

# Generate the SSH key needed to support 'docker exec', and extract its public half
mkdir -p /.runcvm/dropbear
KEY_PUBLIC=$(dropbearkey -t rsa -f /.runcvm/dropbear/key 2>/dev/null | grep ^ssh | cut -d' ' -f2)

# Create json for dropbear EPKA module
cat <<_EOE_ >/.runcvm/dropbear/epka.json && chmod 400 /.runcvm/dropbear/epka.json
[
  {
    "user": "root",
    "keytype": "ssh-rsa",
    "key": "$KEY_PUBLIC",
    "options":"no-X11-forwarding",
    "comments": ""
  }
]
_EOE_

# Load choice of console device
read -r CONSOLE_DEVICE </.runcvm/console

# If RunCVM is to supply the VM's init, write an inittab that supervises the
# entrypoint, the QEMU guest agent and the dropbear SSH server.
if [ "$RUNCVM_INIT" = "1" ]; then

  cat >/etc/inittab <<_EOE_
$CONSOLE_DEVICE::respawn:-$RUNCVM_GUEST/scripts/runcvm-vm-start-wrapper
null::respawn:$RUNCVM_GUEST/scripts/runcvm-vm-qemu-ga
null::respawn:$RUNCVM_GUEST/usr/sbin/dropbear -REF -p $SSHD_PORT -A $RUNCVM_GUEST/tmp/dropbear/libepka_file.so,/.runcvm/dropbear/epka.json -P /.runcvm/dropbear/dropbear.pid
null::ctrlaltdel:$RUNCVM_GUEST/bin/poweroff
null::restart:$RUNCVM_GUEST/bin/poweroff
null::shutdown:$RUNCVM_GUEST/bin/poweroff
_EOE_

  # Allow runcvm-vm-start to run once (and only once)
  rm -f /.runcvm/once

  # Clear the environment, and run our own init, disconnecting stdout and stderr from terminal
  exec -c $RUNCVM_GUEST/bin/init &>/dev/null

else

  # If not, assume the user knows what they're doing: launch qemu-ga and just run their entrypoint.

  # Clean RUNCVM env vars
  clean_env

  # Run the qemu guest agent, needed to support future functionality
  $RUNCVM/scripts/runcvm-vm-qemu-ga &>/dev/null &

  # Run dropbear SSH server, needed to support 'docker exec'
  dropbear -REF -p $SSHD_PORT -A $RUNCVM_GUEST/tmp/dropbear/libepka_file.so,/.runcvm/dropbear/epka.json -P /.runcvm/dropbear/dropbear.pid &>/dev/null &

  # Run init from the image
  # Pipe input/output from/to console device
  exec </dev/$CONSOLE_DEVICE &>/dev/$CONSOLE_DEVICE

  # Invoke runcvm-init with --no-fork purely to create controlling tty,
  # then exec runcvm-vm-start
  exec -c $RUNCVM_GUEST/sbin/runcvm-init --no-fork $RUNCVM_GUEST/scripts/runcvm-vm-start

fi

================================================
FILE: runcvm-scripts/runcvm-vm-qemu-ga
================================================

#!/.runcvm/guest/bin/bash

# Load config
. /.runcvm/config

# Load defaults and aliases
. $RUNCVM_GUEST/scripts/runcvm-ctr-defaults

OPTS=(--retry-path --statedir /.runcvm)

if [ -c "/dev/virtio-ports/org.qemu.guest_agent.0" ]; then
  DEV="/dev/virtio-ports/org.qemu.guest_agent.0"
else
  DEV=$(ls /dev/vport* | head -n 1)
  if [ -n "$DEV" ] && [ -c "$DEV" ]; then
    OPTS+=(-p "$DEV")
  fi
fi

if [ -z "$DEV" ]; then
  exit 0
fi

exec -c "$(which qemu-ga)" "${OPTS[@]}"

================================================
FILE: runcvm-scripts/runcvm-vm-start
================================================

#!/.runcvm/guest/bin/bash

# Load original environment
. /.runcvm/config

# Load defaults and aliases
. $RUNCVM_GUEST/scripts/runcvm-ctr-defaults

if [ -f /.runcvm/once ]; then
  poweroff
  exit 0
else
  touch /.runcvm/once
fi

# Change to saved PWD
cd $(cat /.runcvm/pwd) && unset OLDPWD

# Reload original environment
. /.runcvm/config

# Load original entrypoint
mapfile -t ARGS </.runcvm/entrypoint

================================================
FILE: tests/00-http-docker-swarm/node/docker.sh
================================================

/dev/null)
    [ -n "$IP" ] && break
    [ $i -eq 1 ] && log "Ingress IP detection for this node failed!" && exit 1
    sleep 0.5
  done
  echo -n "$IP" >$FILE
}

cgroupfs_mount

ulimit -u unlimited
modprobe ip_vs

h=$(hostname)
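# node_state is defined in a portion of this file not shown in this extract.
# Judging by its uses of NodeState and IsManager below, a minimal sketch
# would be (docker CLI format fields assumed; sketch only):
node_state() {
  NodeState=$(docker info --format '{{.Swarm.LocalNodeState}}')
  IsManager=$(docker info --format '{{.Swarm.ControlAvailable}}')
}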
log "Checking network ..."

read -r DOCKER_IF DOCKER_IF_GW \
  <<< $(ip -json route show | jq -j '.[] | select(.dst == "default") | .dev, " ", .gateway')

read -r DOCKER_IF_IP DOCKER_IF_MTU <<< \
  $(ip -json addr show eth0 | jq -j '.[0] | .addr_info[0].local, " ", .mtu')

log "- DOCKER_IF=$DOCKER_IF DOCKER_IF_IP=$DOCKER_IF_IP DOCKER_IF_GW=$DOCKER_IF_GW DOCKER_IF_MTU=$DOCKER_IF_MTU"

# Start dockerd and keep it running
DOCKER_OPTS=(--mtu=$DOCKER_IF_MTU)
DOCKER_OPTS+=(--add-runtime runcvm=/opt/runcvm/scripts/runcvm-runtime)

if [ -n "$REGISTRY_MIRROR" ]; then
  # Replace localhost with custom network gateway, if desired to reach registry running on host network
  DOCKER_OPTS+=(--registry-mirror=$(sed "s|/localhost\b|/$DOCKER_IF_GW|" <<< $REGISTRY_MIRROR))
fi

log "Launching 'dockerd ${DOCKER_OPTS[*]}' ..."
while true; do dockerd "${DOCKER_OPTS[@]}" >>/var/log/dockerd.log 2>&1; done &

for i in $(seq 1 10 | sort -nr)
do
  log "Waiting for dockerd to start (#$i) ..."
  docker ps >/dev/null 2>&1 && break
  [ $i -eq 1 ] && exit 1
  sleep 0.5
done

log "dockerd started"

docker info

node_state
log "docker swarm: node state = $NodeState; manager=$IsManager"

log "Creating docker_gwbridge network with MTU $DOCKER_IF_MTU"
docker network create -d bridge \
  --subnet 172.18.0.0/16 \
  --opt com.docker.network.bridge.name=docker_gwbridge \
  --opt com.docker.network.bridge.enable_icc=false \
  --opt com.docker.network.bridge.enable_ip_masquerade=true \
  --opt com.docker.network.driver.mtu=$DOCKER_IF_MTU \
  docker_gwbridge

if [ "$NodeState" = "inactive" ] || [ "$NodeState" = "pending" ]; then
  if [ "$NODE" != "1" ]; then
    for i in $(seq 1 20 | sort -nr)
    do
      log "Waiting for swarm manager startup (#$i) ..."
      [ -f /swarm/worker ] && break
      [ $i -eq 1 ] && exit 1
      sleep 1
    done

    log "Swarm manager has started up."

    for i in $(seq 1 20 | sort -nr)
    do
      log "Joining swarm (#$i) ..."
      . /swarm/worker && break
      [ $i -eq 1 ] && exit 1
      sleep 0.5
    done

    log "Joined swarm!"
  else
    log "Initialising swarm ..."
    if ! docker swarm init >/dev/null; then
      log "Swarm initialisation FAILED!"
      exit 1
    fi
    log "Swarm initialised!"

    if [ -n "$MTU" ] && [ "$MTU" -gt 0 ]; then
      log "Removing default ingress ..."
      echo y | docker network rm ingress

      log "Waiting 3s for ingress removal ..."
      sleep 3

      log "Creating new ingress with MTU $DOCKER_IF_MTU"
      docker network create \
        --driver=overlay \
        --ingress \
        --subnet=10.0.0.0/24 \
        --gateway=10.0.0.1 \
        --opt com.docker.network.driver.mtu=$DOCKER_IF_MTU \
        ingress
    fi

    log "Writing swarm 'join token' to shared storage and waiting for other nodes ..."
    mkdir -p /swarm/nodes && docker swarm join-token worker | grep docker >/swarm/worker

    for i in $(seq 1 30 | sort -nr)
    do
      nodes=$(docker node ls --format '{{json .}}' | wc -l)
      log "Waiting for remaining $((NODES-nodes)) of $NODES nodes to join swarm (#$i) ..."
      [ $nodes -eq $NODES ] && break
      [ $i -eq 1 ] && log "Swarm failed!" && exit 1
      sleep 1
    done

    log "Swarm nodes started:"
    docker node ls
    echo
  fi

  log "Log memory consumption ..."
  free -m

  # Log this trigger line last BUT before (optionally) running DIRD.
  # This is because the test script waits for this line to appear before proceeding to launch the service.
  # We log multiple times to work around a minor bug whereby the test script sometimes fails to react to the first log line alone.
  for i in $(seq 1 5); do log "Swarm complete!"; sleep 0.25; done
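  # For reference (values illustrative, not part of this script): /swarm/worker,
  # written by the manager above via 'docker swarm join-token worker | grep docker',
  # holds a single runnable command of roughly this form, which worker nodes
  # then simply source with '. /swarm/worker':
  #
  #   docker swarm join --token SWMTKN-1-<token> 172.18.0.2:2377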
if [ "$DIRD" = "1" ]; then log "Detecting node ingress network IP ..." detect_ingress_ip /swarm/nodes/$NODE log "Detected node ingress network IP '$(cat /swarm/nodes/$NODE)'" log "Waiting for all nodes' ingress network IPs ..." for i in $(seq 1 60 | sort -nr) do ls /swarm/nodes [ $(ls /swarm/nodes/ | wc -l) -eq $NODES ] && break [ $i -eq 1 ] && log "Ingress IP detection for all nodes failed!" && exit 1 sleep 0.5 done for n in $(ls /swarm/nodes) do IPs+="$(cat /swarm/nodes/$n)," echo "$n: '$(cat /swarm/nodes/$n)'" done IPs=$(echo $IPs | sed 's/,$//') log "Running docker-ingress-routing-daemon --preexisting --ingress-gateway-ips $IPs --install ..." ( sleep 1; while true; do /usr/local/bin/docker-ingress-routing-daemon --preexisting --iptables-wait-seconds 3 --ingress-gateway-ips "$IPs" --install; sleep 1; done ) & fi fi node_state if [ "$NodeState" = "active" ] && [ "$IsManager" = "true" ]; then log "Manager ready" fi log "Looping indefinitely ..." while true; do sleep infinity; done ================================================ FILE: tests/00-http-docker-swarm/test ================================================ #!/bin/bash -e # Load framework functions . ../framework.sh # TEST VARIABLES NODE=00-http-docker-swarm-node # Number of nodes NODES=${NODES:-3} # Network MTU to deploy in Docker network, RunCVM container VM nodes, and on Docker and swarm ingress network running on those nodes. MTU=${MTU:-9000} # Set to "1" to enable installation of https://github.com/newsnowlabs/docker-ingress-routing-daemon on the swarm DIRD=${DIRD:-0} # Set to "1" to disable cleanup of Docker image NO_CLEAN_IMAGE=${NO_CLEAN_IMAGE:-0} # OVERRIDE FRAMEWORK FUNCTIONS nodes() { seq 1 $NODES | sed "s/^/$NODE/"; } volumes() { echo swarm $(nodes); } networks() { echo runcvm-mtu; } images() { echo $IMAGE; } # Run routine cleanup of any preexisting containers, volumes, networks, and images cleanup h=$(hostname) if [ -n "$REGISTRY_MIRROR" ]; then log "REGISTRY_MIRROR '$REGISTRY_MIRROR' detected." else log "No REGISTRY_MIRROR detected: recommend setting REGISTRY_MIRROR=http://localhost:5000 and launching:" log "- docker run -d --name=registry --network=host -e REGISTRY_PROXY_REMOTEURL=https://registry-1.docker.io registry:2" fi log "Build image ..." docker build --iidfile /tmp/iid -f node/Dockerfile node/ IMAGE=$(cat /tmp/iid) if [ -n "$MTU" ] && [ "$MTU" -gt 0 ]; then log "Creating network 'runcvm-mtu' with MTU $MTU ..." docker network create --opt com.docker.network.driver.mtu=$MTU --scope=local runcvm-mtu else log "Creating network 'runcvm-mtu' with default (unspecified) MTU ..." docker network create --scope=local runcvm-mtu fi log "Launching $NODES x RunCVM nodes with image $IMAGE ..." for n in $(seq 1 $NODES) do log -n "Launching RunCVM node $n/$NODES (name=$NODE$n) ... " # Enables Docker's use of overlay2 storage driver in a file-backed disk stored in a dedicated Docker volume # diskopt="--mount=type=volume,src=$NODE$n,dst=/disks --env=RUNCVM_DISKS=/disks/disk1,/var/lib/docker,ext4,500M" # Docker will fall back to using the vfs storage driver, as it detects /var/lib/docker is an overlay2 fs. # diskopt="--mount=type=volume,src=$NODE$n,dst=/var/lib/docker" # Enables Docker's use of overlay2 storage driver in a file-backed disk stored in the container's overlayfs diskopt="--env=RUNCVM_DISKS=/disks/disk1,/var/lib/docker,ext4,500M" # The swarm volume, mounted at /swarm within the RunCVM VMs, will be used to share swarm info # among the nodes. 
  # The swarm volume, mounted at /swarm within the RunCVM VMs, will be used to share swarm info
  # among the nodes.
  docker run \
    -d \
    --rm \
    --runtime=runcvm \
    --network=runcvm-mtu \
    --publish=$((8080+$n-1)):80 \
    --name=$NODE$n \
    --hostname=$NODE$n \
    --memory=512m \
    --env=RUNCVM_VIRTIOFSD_CACHE=none \
    --env=NODE=$n \
    --env=NODES=$NODES \
    --env=MTU=$MTU \
    --env=DIRD=$DIRD \
    --env=REGISTRY_MIRROR=$REGISTRY_MIRROR \
    --mount=type=volume,src=swarm,dst=/swarm \
    $diskopt \
    $IMAGE
done

log "Monitoring ${NODE}1 logs for swarm setup progress ..."
docker logs -f ${NODE}1 -n 0 2>&1 | sed "s/^/> (${NODE}1) > /; /Swarm complete/q0; /Swarm failed/q129;"
log "Finished monitoring ${NODE}1 logs as swarm is set up."

log "Creating http service (please be patient) ..."
docker exec ${NODE}1 docker service create \
  -d \
  --name=http --mode=global -p 80:80 --update-parallelism=0 \
  alpine ash -c "$(tr '\012' ' ' <<_EOE_
apk add --no-cache mini_httpd &&
mkdir -p /www &&
echo -e "#!/bin/sh\n\necho Content-Type: text/plain\necho\necho hostname=\$HOSTNAME remote_addr=\\\$REMOTE_ADDR\nexit 0\n" >/www/index.cgi &&
chmod 755 /www/index.cgi &&
mini_httpd -d /www -D -l /dev/stdout -c '**.cgi'
_EOE_
)"

for i in $(seq 1 200 | sort -nr)
do
  replicas=$(docker exec ${NODE}1 docker service ls --format='{{ .Replicas }}' --filter='Name=http')
  log "Waiting for remainder of $replicas replicas to launch (#$i) ..."
  [ "$replicas" = "$NODES/$NODES" ] && break
  [ $i -eq 1 ] && exit 253
  sleep 1
done

log "All $NODES replicas launched."

sleep 1

# Allow final test to complete, even if we encounter errors
set +e

if [ "$DIRD" = "1" ]; then
  DOCKER_IPV4=$(docker network inspect runcvm-mtu --format='{{(index .IPAM.Config 0).Gateway}}')
else
  DOCKER_IPV4="10.0.0."
fi

log "Running $NODE test looking for '$DOCKER_IPV4' at $(date) ..."

ERRORS=0
TESTS=0
for loop in $(seq 1 250)
do
  i=$((loop % NODES))
  host=http://0.0.0.0:$((8080+i))/

  # Uncomment if running inside a Dockside devtainer (which must be preconnected to a precreated runcvm-mtu Docker network).
  # host=http://$NODE$((i+1)):80/

  response=$(curl --max-time 1 -is $host)
  ERROR=$?

  if [ $ERROR -eq 0 ]; then
    response=$(tr '\012\015' ' ' <<<$response)
    grep -q "remote_addr=$DOCKER_IPV4" <<<$response
    if [ $? -ne 0 ]; then
      log "#$loop Response error: $response"
      ERROR=1
    else
      log "#$loop Response OK: $response"
    fi
  else
    log "#$loop Response error: curl error $ERROR"
    ERROR=1
  fi

  ERRORS=$((ERRORS+ERROR))
  TESTS=$((TESTS+1))
done

log "Completed $NODE test $TESTS times, with $ERRORS errors"

# Uncomment to debug:
# log "Falling to shell, type CTRL+D to exit and clean up"; bash -i

sleep 1

exit $ERRORS
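For reference, the CGI script installed by the service above makes each replica report its hostname and the client address it saw, so a passing probe body looks like this (values illustrative):

hostname=00-http-docker-swarm-node2 remote_addr=10.0.0.2

Without DIRD, connections arriving over the swarm ingress network are NATed, so remote_addr is expected to begin with the ingress subnet prefix 10.0.0.; with DIRD installed, the original client source address (here, the runcvm-mtu network gateway) is expected instead.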
================================================
FILE: tests/01-mariadb/test
================================================

#!/bin/bash -e

NODE=01-mariadb

nodes() {
  echo $NODE-mysqld $NODE-mysql
}

volumes() {
  echo ''
}

networks() {
  echo $NODE-network
}

_cleanup() {
  echo "> ($h) Cleaning up nodes ..."
  docker rm -f $(nodes) 2>/dev/null
  echo

  if [ "$(volumes)" != "" ]; then
    echo "> ($h) Cleaning up volumes ..."
    docker volume rm -f $(volumes)
  fi
  echo

  if [ -n "$IMAGE" ]; then
    echo "> ($h) Cleaning up temporary image ..."
    docker rmi $IMAGE
    echo
  fi

  rm -f /tmp/iid

  if [ "$(networks)" != "" ]; then
    echo "> ($h) Cleaning up networks ..."
    docker network rm $(networks)
  fi
  echo

  echo "> ($h) Cleaned up"
}

cleanup() {
  # Allow this to complete, even if we encounter errors
  set +e

  _cleanup

  # Restore setting to fail on error
  set -e
}

quit() {
  # Don't run a second time
  trap '' TERM INT EXIT

  cleanup
  echo "> ($h) Exiting with code $ERRORS"
}

trap quit TERM INT EXIT

h=$(hostname)

cleanup

echo "> ($h) Creating network $NODE-network ..."
docker network rm $(networks) 2>/dev/null || true
docker network create $NODE-network

# Launch a mariadb VM using RunCVM
echo "> ($h) Launch RunCVM mariadb server as $NODE-mysqld ..."
docker run --runtime=runcvm -d --rm --name=$NODE-mysqld --hostname=$NODE-mysqld --network=$NODE-network --cpus=1 --memory=1G --env=MARIADB_ALLOW_EMPTY_ROOT_PASSWORD=1 mariadb

echo "> ($h) Monitoring mariadb logs ..."
docker logs -f -t -n 0 $NODE-mysqld &

# Allow final test to complete, even if we encounter errors
set +e

# Launch standard runc container to test connecting to the mariadb VM
echo "> ($h) Waiting for mariadb startup and running test queries in runc container ..."
docker run --rm --network=$NODE-network --name=$NODE-mysql --hostname=$NODE-mysql --env=host=$NODE-mysqld alpine ash -c 'apk update && apk add mariadb-client && for a in $(seq 40 -1 1); do if mysql -P 3306 -h $host mysql -e ""; then echo "> $(hostname) Connected to mysqld ..."; break; else echo "> $(hostname) Waiting for mysqld (#$a) ..."; sleep 1; fi; done && mysql -P 3306 -h $host mysql -e "select count(*) from user"'
ERRORS=$?

echo "> ($h) Completed $NODE test with $ERRORS errors"

# bash -i

sleep 1

exit $ERRORS

================================================
FILE: tests/02-user-workdir/test
================================================

#!/bin/bash -e

# Load framework functions
. ../framework.sh

# TEST VARIABLES
NODE=runcvm-01-test
NETWORK="$NODE-network"
IMAGE="alpine"
RUNTIME="${RUNTIME:-runcvm}"

# OVERRIDE FRAMEWORK FUNCTIONS
nodes() { echo $NODE; }
networks() { echo $NETWORK; }

# TEST DETAILS
COMMAND='echo "$(id -u) $(pwd)"'
USER_ID="1000"
WORK_DIR="/tmp"
EXPECTED_OUTPUT="${USER_ID} ${WORK_DIR}"

# TEST FUNCTIONS
# --------------

# Function to test output against expected values
test_output() {
  local test_type="$1"
  local expected_output="$2"
  local output_to_test="$3"

  if [ "$output_to_test" = "$expected_output" ]; then
    log "docker $test_type test: expected and received '$output_to_test' - PASS"
    return 0
  fi

  log "docker $test_type test: expected '$expected_output', but got: '$output_to_test' - FAIL"
  return 1
}

# TEST PROCEDURE
# --------------

# Run routine cleanup of any preexisting containers, volumes, networks, and images
cleanup

# Create custom network
log -n "Creating network '$NETWORK' ..."
docker network create $NETWORK

# Create and run the container
log -n "Launching runcvm container with command '$COMMAND' ..."
docker run \
  -d \
  --rm \
  --runtime=$RUNTIME \
  --network=$NETWORK \
  --name=$NODE \
  --hostname=$NODE \
  --user=$USER_ID \
  --workdir=$WORK_DIR \
  $IMAGE \
  sh -c "$COMMAND; while true; do echo ===DONE===; sleep 1; done"

shopt -s lastpipe

log "Container '$NODE' output ..."
docker logs -f $NODE 2>&1 | sed "s/^/($NODE) > /; /===DONE===/q0; /failed/q129;"

ERRORS=0

# Test docker run command:
# - Retrieve first line of logs from container
# - Strip carriage returns for now, as it's unclear why they are present and are not present in the expected output
test_output "run" "$EXPECTED_OUTPUT" "$(docker logs $NODE | grep -v '===DONE===' | tr -d '\015')" || ERRORS=$((ERRORS+1))

# Test docker exec command:
# - Retrieve output from exec command for exec test
test_output "exec" "$EXPECTED_OUTPUT" "$(docker exec $NODE sh -c "$COMMAND")" || ERRORS=$((ERRORS+1))

# Final output
log "Tests completed with $ERRORS errors"

exit $ERRORS
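This test and tests/03-env/test below follow the same pattern: source framework.sh, override its hook functions, run cleanup, then assert on container output. A minimal skeleton for a new test under these conventions might look like this (names illustrative; framework.sh's EXIT trap handles the final cleanup):

#!/bin/bash -e
. ../framework.sh

NODE=runcvm-99-example          # hypothetical test name
nodes() { echo $NODE; }

cleanup
docker run -d --rm --runtime=runcvm --name=$NODE alpine sleep 60
# ... assertions against the running container go here ...
log "Tests completed with 0 errors"
exit 0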
================================================
FILE: tests/03-env/test
================================================

#!/bin/bash -e

# Load framework functions
. ../framework.sh

# TEST VARIABLES
NODE=runcvm-01-test
NETWORK="$NODE-network"
IMAGE="alpine"
RUNTIME="${RUNTIME:-runcvm}"

# OVERRIDE FRAMEWORK FUNCTIONS
nodes() { echo $NODE; }
networks() { echo $NETWORK; }

# TEST DETAILS
COMMAND='env | sort'
EXPECTED_OUTPUT="$(echo -e 'HOME=/root\nHOSTNAME=runcvm-01-test\nPATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin\nPWD=/\nSHLVL=1\n')"

# TEST FUNCTIONS
# --------------

# Function to test output against expected values
test_output() {
  local test_type="$1"
  local expected_output="$2"
  local output_to_test="$3"

  if [ "$output_to_test" = "$expected_output" ]; then
    log "docker $test_type test: expected and received '$output_to_test' - PASS"
    return 0
  fi

  log "docker $test_type test: expected '$expected_output', but got: '$output_to_test' - FAIL"
  return 1
}

# TEST PROCEDURE
# --------------

# Run routine cleanup of any preexisting containers, volumes, networks, and images
cleanup

# Create custom network
log -n "Creating network '$NETWORK' ..."
docker network create $NETWORK

# Create and run the container
log -n "Launching runcvm container with command '$COMMAND' ..."
docker run \
  -d \
  --rm \
  --runtime=$RUNTIME \
  --network=$NETWORK \
  --name=$NODE \
  --hostname=$NODE \
  --init \
  $IMAGE \
  sh -c "$COMMAND; while true; do echo ===DONE===; sleep 1; done"

shopt -s lastpipe

log "Container '$NODE' output ..."
docker logs -f $NODE 2>&1 | sed "s/^/($NODE) > /; /===DONE===/q0;"

ERRORS=0

# Test docker run command:
# - Retrieve first line of logs from container
# - Strip carriage returns for now, as it's unclear why they are present and are not present in the expected output
test_output "run" "$EXPECTED_OUTPUT" "$(docker logs $NODE | grep -v '===DONE===' | tr -d '\015')" || ERRORS=$((ERRORS+1))

# Test docker exec command:
# - Retrieve output from exec command for exec test
test_output "exec" "$EXPECTED_OUTPUT" "$(docker exec $NODE sh -c "$COMMAND")" || ERRORS=$((ERRORS+1))

# Final output
log "Tests completed with $ERRORS errors"

exit $ERRORS

================================================
FILE: tests/framework.sh
================================================

images() { echo; }
nodes() { echo; }
volumes() { echo; }
networks() { echo; }

log() {
  local opts
  if [ "$1" = "-n" ]; then opts="-n"; shift; fi
  echo $opts "> $1"
}

_cleanup() {
  if [ "$(nodes)" != "" ]; then
    log -n "Cleaning up nodes ... "
    docker rm -f $(nodes) 2>&1
  fi

  if [ "$(volumes)" != "" ]; then
    log -n "Cleaning up volumes ... "
    docker volume rm -f $(volumes) 2>&1
  fi

  if [ "$(images)" != "" ] && [ "$NO_CLEAN_IMAGE" != "1" ]; then
    log -n "Cleaning up temporary images ... "
    docker rmi $(images) 2>&1
  fi

  if [ "$(networks)" != "" ]; then
    log -n "Cleaning up networks ... "
    docker network rm $(networks) 2>&1
  fi

  rm -f /tmp/iid
}

cleanup() {
  # Allow this to complete, even if we encounter errors
  set +e

  _cleanup

  # Restore setting to fail on error
  set -e
}

quit() {
  local code=$?

  # Don't run a second time
  trap '' TERM INT EXIT

  cleanup
  log "Exiting with code $code"
}

term() {
  exit 254
}

# Standard setup: clean up (via quit) on exit, and convert signals to an exit
trap quit EXIT
trap term TERM INT QUIT
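Each test reads its tunables from the environment (NODES, MTU, DIRD and NO_CLEAN_IMAGE for the swarm test; RUNTIME for the later tests), so individual runs can be adjusted without editing the scripts. Since tests locate framework.sh relative to their own directory, run them from there; illustrative invocations:

cd tests/00-http-docker-swarm && NODES=5 MTU=1500 DIRD=1 ./test
cd tests/02-user-workdir && RUNTIME=runc ./test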
================================================
FILE: tests/run
================================================

#!/bin/bash -e

ERRORS=0

DIR=$(dirname $0)
if [ -d "$DIR" ]; then
  echo "Running RunCVM integration tests in '$DIR' ..."
  cd $DIR
else
  echo "$0: Error: RunCVM integration test directory '$DIR' not found; aborting!"
  exit 1
fi

for test in *
do
  [ -d "$test" ] || continue

  cd $test
  ./test 2>&1 | sed "s/^/$test - /"
  # Capture the test's own exit status, not sed's
  TEST_ERRORS=${PIPESTATUS[0]}
  ERRORS=$((ERRORS+TEST_ERRORS))
  cd ..

  echo "RunCVM test $test finished with $TEST_ERRORS errors"
done

echo "RunCVM integration tests completed with $ERRORS errors"
exit $ERRORS